Skip to content

Commit

Permalink
Error if assigning a "UTF-8" string with invalid UTF-8.
Browse files Browse the repository at this point in the history
This has been a warning since v28.x: fa8dbae

See also https://protobuf.dev/news/2024-10-02/#utf-8-enforcement which pre-announces this for v30.x.

Fixes #17484

PiperOrigin-RevId: 691917648
  • Loading branch information
zhangskz authored and copybara-github committed Oct 31, 2024
1 parent 7521732 commit 2f505a7
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 31 deletions.
13 changes: 3 additions & 10 deletions ruby/ext/google/protobuf_c/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,9 @@ VALUE Convert_CheckStringUtf8(VALUE str) {
// not mean that it is *valid* UTF-8. We have to check separately
// whether it is valid.
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
// TODO: For now
// we only warn for this case. We will remove the warning and throw an
// exception below in the 30.x release

rb_warn(
"String is invalid UTF-8. This will be an error in a future "
"version.");
// VALUE exc = rb_const_get_at(
// rb_cEncoding, rb_intern("InvalidByteSequenceError"));
// rb_raise(exc, "String is invalid UTF-8");
VALUE exc = rb_const_get_at(
rb_cEncoding, rb_intern("InvalidByteSequenceError"));
rb_raise(exc, "String is invalid UTF-8");
}
} else {
// Note: this will not duplicate underlying string data unless
Expand Down
6 changes: 1 addition & 5 deletions ruby/lib/google/protobuf/ffi/internal/convert.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,7 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def)
value = value.to_s if value.is_a?(Symbol)
if value.encoding == Encoding::UTF_8
unless value.valid_encoding?
# TODO:
# For now we only warn for this case. We will remove the
# warning and throw an exception below in the 30.x release
warn "String is invalid UTF-8. This will be an error in a future version."
# raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
end
string_value = value
else
Expand Down
17 changes: 11 additions & 6 deletions ruby/src/main/java/com/google/protobuf/jruby/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,15 @@ public static boolean isMapEntry(FieldDescriptor fieldDescriptor) {
&& fieldDescriptor.getMessageType().getOptions().getMapEntry();
}

public static RaiseException createInvalidByteSequenceError(
ThreadContext context, String message) {
if (cInvalidByteSequenceError == null) {
cInvalidByteSequenceError =
(RubyClass) context.runtime.getClassFromPath("Encoding::InvalidByteSequenceError");
}
return RaiseException.from(context.runtime, cInvalidByteSequenceError, message);
}

public static RaiseException createTypeError(ThreadContext context, String message) {
if (cTypeError == null) {
cTypeError = (RubyClass) context.runtime.getClassFromPath("Google::Protobuf::TypeError");
Expand Down Expand Up @@ -393,12 +402,7 @@ private static IRubyObject validateAndEncodeString(
RubyString string = (RubyString) value;
if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
if (string.isCodeRangeBroken()) {
// TODO: For now we only warn for
// this case. We will remove the warning and throw an exception in the 30.x release
context
.runtime
.getWarnings()
.warn("String is invalid UTF-8. This will be an error in a future version.");
throw createInvalidByteSequenceError(context, "String is invalid UTF-8.");
}
}

Expand All @@ -424,4 +428,5 @@ private static IRubyObject validateAndEncodeString(
private static final long UINT_MAX = 0xffffffffl;

private static RubyClass cTypeError;
private static RubyClass cInvalidByteSequenceError;
}
14 changes: 4 additions & 10 deletions ruby/tests/utf8.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,10 @@ def test_map_value
# Tests the case of string objects that are marked UTF-8, but contain invalid
# UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
# This case will raise Encoding::InvalidByteSequenceError
class MarkedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
assert_raises(Encoding::InvalidByteSequenceError, &block)
end

def bad_utf8_string
Expand All @@ -79,13 +76,10 @@ def bad_utf8_string
# valid UTF-8, but are later modified to be invalid UTF-8. This may put the
# string into an state of "unknown" validity.
#
# For now these only warn, but in the next major version they will throw an
# exception.
# This case will raise Encoding::InvalidByteSequenceError
class MarkedModifiedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
assert_raises(Encoding::InvalidByteSequenceError, &block)
end

def bad_utf8_string
Expand Down

0 comments on commit 2f505a7

Please sign in to comment.