Skip to content

Commit

Permalink
Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8
Browse files Browse the repository at this point in the history
[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)
  • Loading branch information
haberman authored Jul 17, 2024
2 parents ca75771 + fe9a53c commit fa8dbae
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 21 deletions.
1 change: 1 addition & 0 deletions ruby/Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ test_protos = %w[
tests/test_import_proto2.proto
tests/test_ruby_package.proto
tests/test_ruby_package_proto2.proto
tests/utf8.proto
]

# These are omitted for now because we don't support proto2.
Expand Down
50 changes: 37 additions & 13 deletions ruby/ext/google/protobuf_c/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,41 @@ static int32_t Convert_ToEnum(VALUE value, const char* name,
rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name);
}

// Checks/normalizes a Ruby string that is about to be stored in a UTF-8
// string field and returns the string to use.
//
// - A string already tagged as UTF-8 is returned as-is; if its bytes are not
//   valid UTF-8, a warning is emitted (see the TODO below).
// - A string tagged with any other encoding is transcoded to UTF-8, which
//   raises if the conversion cannot be performed.
VALUE Convert_CheckStringUtf8(VALUE str) {
  VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());

  if (rb_obj_encoding(str) != utf8) {
    // Note: this will not duplicate underlying string data unless
    // necessary.
    //
    // This will throw an exception if the conversion cannot be performed:
    // - Encoding::UndefinedConversionError if certain characters cannot be
    //   converted to UTF-8.
    // - Encoding::InvalidByteSequenceError if certain characters were invalid
    //   in the source encoding.
    str = rb_str_encode(str, utf8, 0, Qnil);
    PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN);
    return str;
  }

  // Note: Just because a string is marked as having UTF-8 encoding does not
  // mean that it is *valid* UTF-8. We have to check separately whether it is
  // valid.
  if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
    // TODO: For now we only warn for this case. We will remove the warning
    // and throw an exception (the commented-out code below) in the 30.x
    // release.
    rb_warn(
        "String is invalid UTF-8. This will be an error in a future "
        "version.");
    // VALUE exc = rb_const_get_at(
    //     rb_cEncoding, rb_intern("InvalidByteSequenceError"));
    // rb_raise(exc, "String is invalid UTF-8");
  }

  return str;
}

upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
TypeInfo type_info, upb_Arena* arena) {
upb_MessageValue ret;
Expand Down Expand Up @@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
}
break;
}
case kUpb_CType_String: {
VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
case kUpb_CType_String:
if (rb_obj_class(value) == rb_cSymbol) {
value = rb_funcall(value, rb_intern("to_s"), 0);
} else if (!rb_obj_is_kind_of(value, rb_cString)) {
Expand All @@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
rb_class2name(CLASS_OF(value)));
}

if (rb_obj_encoding(value) != utf8) {
// Note: this will not duplicate underlying string data unless
// necessary.
value = rb_str_encode(value, utf8, 0, Qnil);

if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) {
rb_raise(rb_eEncodingError, "String is invalid UTF-8");
}
}

value = Convert_CheckStringUtf8(value);
ret.str_val = Convert_StringData(value, arena);
break;
}
case kUpb_CType_Bytes: {
VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding());
if (rb_obj_class(value) != rb_cString) {
Expand Down
15 changes: 11 additions & 4 deletions ruby/lib/google/protobuf/ffi/internal/convert.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,18 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def)
return_value[:bool_val] = value
when :string
raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol)
begin
value = value.to_s if value.is_a?(Symbol)
if value.encoding == Encoding::UTF_8
unless value.valid_encoding?
# TODO:
# For now we only warn for this case. We will remove the
# warning and throw an exception below in the 30.x release
warn "String is invalid UTF-8. This will be an error in a future version."
# raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
end
string_value = value
else
string_value = value.to_s.encode("UTF-8")
rescue Encoding::UndefinedConversionError
# TODO - why not include the field name here?
raise Encoding::UndefinedConversionError.new "String is invalid UTF-8"
end
return_value[:str_val][:size] = string_value.bytesize
return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize)
Expand Down
19 changes: 15 additions & 4 deletions ruby/src/main/java/com/google/protobuf/jruby/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.*;
import org.jruby.common.RubyWarnings;
import org.jruby.exceptions.RaiseException;
import org.jruby.ext.bigdecimal.RubyBigDecimal;
import org.jruby.runtime.Block;
Expand Down Expand Up @@ -389,11 +390,21 @@ private static IRubyObject validateAndEncodeString(
if (!(value instanceof RubyString))
throw createInvalidTypeError(context, fieldType, fieldName, value);

RubyString string = (RubyString) value;
if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
if (string.isCodeRangeBroken()) {
// TODO: For now we only warn for
// this case. We will remove the warning and throw an exception in the 30.x release
context
.runtime
.getWarnings()
.warn("String is invalid UTF-8. This will be an error in a future version.");
}
}

value =
((RubyString) value)
.encode(
context,
context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
string.encode(
context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
value.setFrozen(true);
return value;
}
Expand Down
10 changes: 10 additions & 0 deletions ruby/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,16 @@ ruby_test(
],
)

# Runs utf8.rb, the tests for assigning invalid or non-UTF-8 strings to
# string fields.
ruby_test(
name = "utf8",
srcs = ["utf8.rb"],
deps = [
":test_ruby_protos",
"//ruby:protobuf",
"@protobuf_bundle//:test-unit",
],
)

ruby_test(
name = "well_known_types_test",
srcs = ["well_known_types_test.rb"],
Expand Down
9 changes: 9 additions & 0 deletions ruby/tests/utf8.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
syntax = "proto2";

package utf8_test_protos;

// Message used by utf8.rb. It has a string in every position the tests
// assign to: a singular field, a repeated field, a map key and a map value.
message TestUtf8 {
// Singular string field.
optional string optional_string = 1;
// Repeated string field.
repeated string repeated_string = 2;
// Map with string keys and string values.
map<string, string> map_string_string = 3;
}
136 changes: 136 additions & 0 deletions ruby/tests/utf8.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/ruby

require 'google/protobuf'
require 'utf8_pb'
require 'test/unit'

# Test helper that can temporarily collect warning messages instead of letting
# them reach the default warning handler.
#
# The buffer lives in a class variable on purpose: the methods below exist
# both as module functions on CaptureWarnings and as methods on whatever
# extends this module, and every copy has to see the same buffer.
module CaptureWarnings
  # nil while no capture is active; an Array of messages during #capture.
  @@warnings = nil

  module_function

  # Stores +message+ in the active capture buffer. When no capture is active
  # the call is forwarded, with its original arguments, to the next +warn+
  # implementation.
  def warn(message, category: nil, **kwargs)
    return super unless @@warnings

    @@warnings.push(message)
  end

  # Runs the given block with capturing enabled and returns the buffer of
  # messages recorded while it ran. Capturing is always switched off again
  # afterwards, even if the block raises.
  def capture
    begin
      @@warnings = []
      yield
      @@warnings
    ensure
      @@warnings = nil
    end
  end
end

# Mix CaptureWarnings into the Warning module so that its warn method sits in
# front of Warning's own warn.
Warning.extend CaptureWarnings

# Shared test cases, mixed into each concrete test class below.
#
# The including class must provide:
# - bad_utf8_string: returns the problematic string for that scenario.
# - assert_bad_utf8: takes a block that performs the assignment and checks
#   the outcome expected for that scenario.
#
# Each case assigns the bad string to a different kind of string slot.
module Utf8Test
  # Singular string field.
  def test_scalar
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.optional_string = bad_utf8_string
    end
  end

  # Element appended to a repeated string field.
  def test_repeated
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.repeated_string << bad_utf8_string
    end
  end

  # Key of a string-to-string map field.
  def test_map_key
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string[bad_utf8_string] = "abc"
    end
  end

  # Value of a string-to-string map field.
  def test_map_value
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string["abc"] = bad_utf8_string
    end
  end
end

# Tests the case of string objects that are marked UTF-8, but contain invalid
# UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
class MarkedUtf8Test < Test::Unit::TestCase
  include Utf8Test

  # Runs the block while capturing warnings and expects exactly one warning,
  # the "invalid UTF-8" notice.
  def assert_bad_utf8(&block)
    captured = CaptureWarnings.capture(&block)
    assert_equal 1, captured.length
    assert_match(/String is invalid UTF-8. This will be an error in a future version./, captured[0])
  end

  # Returns a string that fails valid_encoding? from the moment it is
  # created (this is asserted before it is returned).
  def bad_utf8_string
    "\x80".tap { |candidate| assert_false candidate.valid_encoding? }
  end
end

# This test doesn't work in JRuby because JRuby appears to have a bug where
# the "valid" bit on a string's data is not invalidated properly when the
# string is modified: https://github.com/jruby/jruby/issues/8316
unless defined?(JRUBY_VERSION)
  # Tests the case of string objects that are marked UTF-8, and initially contain
  # valid UTF-8, but are later modified to be invalid UTF-8. This may put the
  # string into a state of "unknown" validity.
  #
  # For now these only warn, but in the next major version they will throw an
  # exception.
  class MarkedModifiedUtf8Test < Test::Unit::TestCase
    # Runs the block while capturing warnings and expects exactly one
    # warning, the "invalid UTF-8" notice.
    def assert_bad_utf8(&block)
      warnings = CaptureWarnings.capture(&block)
      assert_equal 1, warnings.length
      assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
    end

    # Returns a string that was valid when created and was then modified in
    # place to contain an invalid byte.
    def bad_utf8_string
      # Mutate an explicit copy rather than the literal itself. This method
      # runs inside the capture block, and Ruby 3.4+ can emit a deprecation
      # warning when a string literal is mutated (when deprecation warnings
      # are enabled). That extra warning would be captured too and break the
      # "exactly one warning" assertion.
      str = " ".dup
      assert_true str.valid_encoding?
      str[0] = "\x80"
      str
    end

    include Utf8Test
  end
end

# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but contain invalid UTF-8.
#
# This case will raise Encoding::UndefinedConversionError.
class MarkedNonUtf8Test < Test::Unit::TestCase
  include Utf8Test

  # Expects the assignment performed by the block to raise
  # Encoding::UndefinedConversionError.
  def assert_bad_utf8
    assert_raises(Encoding::UndefinedConversionError) do
      yield
    end
  end

  # Returns the byte 0x80 tagged as binary (ASCII-8BIT). The string is valid
  # in that encoding, which is asserted before it is returned.
  def bad_utf8_string
    "\x80".force_encoding(Encoding::BINARY).tap do |candidate|
      assert_true candidate.valid_encoding?
    end
  end
end

# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but are invalid even in their source encoding.
#
# This case will raise Encoding::InvalidByteSequenceError
class MarkedInvalidNonUtf8Test < Test::Unit::TestCase
  # This class has its own name on purpose. Calling it MarkedNonUtf8Test
  # would reopen the class of that name defined earlier in this file and
  # replace its assert_bad_utf8 / bad_utf8_string, so the
  # Encoding::UndefinedConversionError scenario would never be exercised.

  # Expects the assignment performed by the block to raise
  # Encoding::InvalidByteSequenceError.
  def assert_bad_utf8(&block)
    assert_raises(Encoding::InvalidByteSequenceError, &block)
  end

  # Returns the byte 0x80 tagged as US-ASCII. The string is invalid in that
  # encoding, which is asserted before it is returned.
  def bad_utf8_string
    str = "\x80".force_encoding(Encoding::ASCII)
    assert_false str.valid_encoding?
    str
  end

  include Utf8Test
end

0 comments on commit fa8dbae

Please sign in to comment.