-
Notifications
You must be signed in to change notification settings - Fork 15.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8
[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)
- Loading branch information
Showing
7 changed files
with
219 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
syntax = "proto2"; | ||
|
||
package utf8_test_protos; | ||
|
||
// Message with one string-typed field of each shape: a singular field, a
// repeated field, and a map whose keys and values are both strings.
message TestUtf8 { | ||
// Singular (optional) string field.
optional string optional_string = 1; | ||
// List of strings.
repeated string repeated_string = 2; | ||
// String-to-string map; both the key and the value are string-typed.
map<string, string> map_string_string = 3; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
#!/usr/bin/ruby | ||
|
||
require 'google/protobuf' | ||
require 'utf8_pb' | ||
require 'test/unit' | ||
|
||
# Test helper that intercepts Ruby warnings so tests can assert on them.
#
# While a #capture block is active, messages passed to #warn are recorded
# instead of being printed; otherwise they are handed to the next +warn+
# implementation in the lookup chain.
module CaptureWarnings
  # Buffer for the capture currently in progress, or nil when not capturing.
  # Kept as a class variable so the same buffer is visible whether a method
  # runs on CaptureWarnings itself or on an object extended with it.
  @@warnings = nil

  module_function

  # Records +message+ when a capture is active; otherwise defers to +super+.
  #
  # +category+ and +kwargs+ are not inspected here. They are declared so the
  # bare +super+ forwards them unchanged.
  def warn(message, category: nil, **kwargs)
    if @@warnings
      @@warnings << message
    else
      super
    end
  end

  # Runs the given block and returns an Array of the warning messages that
  # were recorded while it ran.
  #
  # Each call uses its own buffer and restores whichever buffer was active
  # before it (even when the block raises), so captures may be nested without
  # the inner one switching off or clobbering the outer one.
  def capture
    previous = @@warnings
    captured = []
    @@warnings = captured
    yield
    captured
  ensure
    @@warnings = previous
  end
end

# Hook the helper into Ruby's Warning module so warnings go through
# CaptureWarnings#warn.
Warning.extend CaptureWarnings
|
||
# Shared test cases for writing a problematic string into every kind of
# string slot on Utf8TestProtos::TestUtf8.
#
# The including class must provide two hooks:
#   * bad_utf8_string - returns the string under test
#   * assert_bad_utf8 - runs the given block and verifies how the bad string
#     is reported
module Utf8Test
  # Bad string assigned to the singular string field.
  def test_scalar
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.optional_string = bad_utf8_string
    end
  end

  # Bad string appended to the repeated string field.
  def test_repeated
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.repeated_string << bad_utf8_string
    end
  end

  # Bad string used as a map key.
  def test_map_key
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string[bad_utf8_string] = "abc"
    end
  end

  # Bad string used as a map value.
  def test_map_value
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string["abc"] = bad_utf8_string
    end
  end
end
|
||
# Tests the case of string objects that are marked UTF-8, but contain invalid
# UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
class MarkedUtf8Test < Test::Unit::TestCase
  # Runs the block and asserts that it produced exactly one warning, and that
  # the warning is the invalid-UTF-8 notice.
  def assert_bad_utf8(&block)
    warnings = CaptureWarnings.capture(&block)
    assert_equal 1, warnings.length
    # The dots are escaped so they match literal periods, not any character.
    assert_match(/String is invalid UTF-8\. This will be an error in a future version\./, warnings[0])
  end

  # Returns a string whose single byte, 0x80, is not valid in the string's
  # encoding (checked via valid_encoding? before returning).
  def bad_utf8_string
    str = "\x80"
    assert_false str.valid_encoding?
    str
  end

  include Utf8Test
end
|
||
# This test doesn't work in JRuby because JRuby appears to have a bug where
# the "valid" bit on a string's data is not invalidated properly when the
# string is modified: https://github.com/jruby/jruby/issues/8316
unless defined?(JRUBY_VERSION)
  # Tests the case of string objects that are marked UTF-8, and initially
  # contain valid UTF-8, but are later modified to be invalid UTF-8. This may
  # put the string into a state of "unknown" validity.
  #
  # For now these only warn, but in the next major version they will throw an
  # exception.
  class MarkedModifiedUtf8Test < Test::Unit::TestCase
    # Runs the block and asserts that it produced exactly one warning, and
    # that the warning is the invalid-UTF-8 notice.
    def assert_bad_utf8(&block)
      warnings = CaptureWarnings.capture(&block)
      assert_equal 1, warnings.length
      # The dots are escaped so they match literal periods, not any character.
      assert_match(/String is invalid UTF-8\. This will be an error in a future version\./, warnings[0])
    end

    # Returns a string that starts out valid and is then edited in place so
    # that its first character becomes the byte 0x80.
    def bad_utf8_string
      str = " "
      assert_true str.valid_encoding?
      # Mutate after the validity check so the string is modified post-check.
      str[0] = "\x80"
      str
    end

    include Utf8Test
  end
end
|
||
# Covers strings tagged with a non-UTF-8 encoding (ASCII-8BIT here) that are
# valid in that encoding but are not valid UTF-8.
#
# Writing such a string is expected to raise
# Encoding::UndefinedConversionError.
class MarkedNonUtf8Test < Test::Unit::TestCase
  # Runs the caller's block and asserts that it raises
  # Encoding::UndefinedConversionError.
  def assert_bad_utf8
    assert_raises(Encoding::UndefinedConversionError) do
      yield
    end
  end

  # Returns the byte 0x80 tagged as ASCII-8BIT, after checking that it is
  # considered valid in that encoding.
  def bad_utf8_string
    "\x80".force_encoding(Encoding::ASCII_8BIT).tap do |binary|
      assert_true binary.valid_encoding?
    end
  end

  include Utf8Test
end
|
||
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but are invalid even in their source encoding.
#
# This case will raise Encoding::InvalidByteSequenceError.
#
# The class is deliberately not named MarkedNonUtf8Test: reusing that name
# would reopen the earlier class of that name and replace its helper methods,
# so its Encoding::UndefinedConversionError scenario would never be exercised.
class MarkedInvalidNonUtf8Test < Test::Unit::TestCase
  # Asserts that the given block raises Encoding::InvalidByteSequenceError.
  def assert_bad_utf8(&block)
    assert_raises(Encoding::InvalidByteSequenceError, &block)
  end

  # Returns the byte 0x80 tagged as ASCII, after checking that it is not
  # valid in that encoding.
  def bad_utf8_string
    str = "\x80".force_encoding(Encoding::ASCII)
    assert_false str.valid_encoding?
    str
  end

  include Utf8Test
end