Error if assigning a "UTF-8" string with invalid UTF-8.

This has been a warning since v28.x: fa8dbae See also https://protobuf.dev/news/2024-10-02/#utf-8-enforcement which pre-announces this for v30.x. Fixes #17484 PiperOrigin-RevId: 691917648
protocolbuffers · Oct 31, 2024 · 2f505a7 · 2f505a7
1 parent 7521732
commit 2f505a7
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 31 deletions.
diff --git a/ruby/ext/google/protobuf_c/convert.c b/ruby/ext/google/protobuf_c/convert.c
@@ -112,16 +112,9 @@ VALUE Convert_CheckStringUtf8(VALUE str) {
     // not mean that it is *valid* UTF-8.  We have to check separately
     // whether it is valid.
     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
-      // TODO: For now
-      // we only warn for this case.  We will remove the warning and throw an
-      // exception below in the 30.x release
-
-      rb_warn(
-          "String is invalid UTF-8. This will be an error in a future "
-          "version.");
-      // VALUE exc = rb_const_get_at(
-      //     rb_cEncoding, rb_intern("InvalidByteSequenceError"));
-      // rb_raise(exc, "String is invalid UTF-8");
+      VALUE exc = rb_const_get_at(
+          rb_cEncoding, rb_intern("InvalidByteSequenceError"));
+      rb_raise(exc, "String is invalid UTF-8");
     }
   } else {
     // Note: this will not duplicate underlying string data unless

diff --git a/ruby/lib/google/protobuf/ffi/internal/convert.rb b/ruby/lib/google/protobuf/ffi/internal/convert.rb
@@ -36,11 +36,7 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def)
             value = value.to_s if value.is_a?(Symbol)
             if value.encoding == Encoding::UTF_8
               unless value.valid_encoding?
-                # TODO:
-                # For now we only warn for this case.  We will remove the
-                # warning and throw an exception below in the 30.x release
-                warn "String is invalid UTF-8. This will be an error in a future version."
-                # raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
+                raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
               end
               string_value = value
             else

diff --git a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
@@ -331,6 +331,15 @@ public static boolean isMapEntry(FieldDescriptor fieldDescriptor) {
         && fieldDescriptor.getMessageType().getOptions().getMapEntry();
   }
 
+  public static RaiseException createInvalidByteSequenceError(
+      ThreadContext context, String message) {
+    if (cInvalidByteSequenceError == null) {
+      cInvalidByteSequenceError =
+          (RubyClass) context.runtime.getClassFromPath("Encoding::InvalidByteSequenceError");
+    }
+    return RaiseException.from(context.runtime, cInvalidByteSequenceError, message);
+  }
+
   public static RaiseException createTypeError(ThreadContext context, String message) {
     if (cTypeError == null) {
       cTypeError = (RubyClass) context.runtime.getClassFromPath("Google::Protobuf::TypeError");
@@ -393,12 +402,7 @@ private static IRubyObject validateAndEncodeString(
     RubyString string = (RubyString) value;
     if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
       if (string.isCodeRangeBroken()) {
-        // TODO: For now we only warn for
-        // this case.  We will remove the warning and throw an exception in the 30.x release
-        context
-            .runtime
-            .getWarnings()
-            .warn("String is invalid UTF-8. This will be an error in a future version.");
+        throw createInvalidByteSequenceError(context, "String is invalid UTF-8.");
       }
     }
 
@@ -424,4 +428,5 @@ private static IRubyObject validateAndEncodeString(
   private static final long UINT_MAX = 0xffffffffl;
 
   private static RubyClass cTypeError;
+  private static RubyClass cInvalidByteSequenceError;
 }
diff --git a/ruby/tests/utf8.rb b/ruby/tests/utf8.rb
@@ -53,13 +53,10 @@ def test_map_value
 # Tests the case of string objects that are marked UTF-8, but contain invalid
 # UTF-8.
 #
-# For now these only warn, but in the next major version they will throw an
-# exception.
+# This case will raise Encoding::InvalidByteSequenceError
 class MarkedUtf8Test < Test::Unit::TestCase
   def assert_bad_utf8(&block)
-    warnings = CaptureWarnings.capture(&block)
-    assert_equal 1, warnings.length
-    assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+    assert_raises(Encoding::InvalidByteSequenceError, &block)
   end
 
   def bad_utf8_string
@@ -79,13 +76,10 @@ def bad_utf8_string
   # valid UTF-8, but are later modified to be invalid UTF-8.  This may put the
   # string into an state of "unknown" validity.
   #
-  # For now these only warn, but in the next major version they will throw an
-  # exception.
+  # This case will raise Encoding::InvalidByteSequenceError
   class MarkedModifiedUtf8Test < Test::Unit::TestCase
     def assert_bad_utf8(&block)
-      warnings = CaptureWarnings.capture(&block)
-      assert_equal 1, warnings.length
-      assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+      assert_raises(Encoding::InvalidByteSequenceError, &block)
     end
 
     def bad_utf8_string