Skip to content

Commit

Permalink
Throw InvalidProtocolBufferException when parsing invalid UTF-8 in C#.
Browse files Browse the repository at this point in the history
Uses UTF8Encoding configured with throwOnInvalidBytes=True instead of default Encoding.UTF8 per https://stackoverflow.com/questions/62762770/forcing-encoding-utf8-getstring-to-throw-an-argumentexception/62763077#62763077

This surfaces errors for bad UTF-8 earlier in C# for consistency across languages as announced in https://protobuf.dev/news/2024-10-02/#utf-8-enforcement

PiperOrigin-RevId: 694491987
  • Loading branch information
zhangskz authored and copybara-github committed Nov 8, 2024
1 parent fa3790f commit db9b2c8
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 47 deletions.
16 changes: 7 additions & 9 deletions csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion

using System;
using System.Buffers;
using System.IO;
using Google.Protobuf.TestProtos;
using Proto2 = Google.Protobuf.TestProtos.Proto2;
using NUnit.Framework;
using System;
using System.Buffers;
using System.IO;

namespace Google.Protobuf
{
Expand Down Expand Up @@ -577,12 +577,11 @@ public void SizeLimit()
}

/// <summary>
/// Tests that if we read an string that contains invalid UTF-8, no exception
/// is thrown. Instead, the invalid bytes are replaced with the Unicode
/// "replacement character" U+FFFD.
/// Tests that if we read a string that contains invalid UTF-8, an exception
/// is thrown.
/// </summary>
[Test]
public void ReadInvalidUtf8()
public void ReadInvalidUtf8ThrowsInvalidProtocolBufferException()
{
MemoryStream ms = new MemoryStream();
CodedOutputStream output = new CodedOutputStream(ms);
Expand All @@ -597,8 +596,7 @@ public void ReadInvalidUtf8()
CodedInputStream input = new CodedInputStream(ms);

Assert.AreEqual(tag, input.ReadTag());
string text = input.ReadString();
Assert.AreEqual('\ufffd', text[0]);
Assert.Throws<InvalidProtocolBufferException>(() => input.ReadString());
}

[Test]
Expand Down
16 changes: 6 additions & 10 deletions csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion

using System;
using System.IO;
using Google.Protobuf.Collections;
using Google.Protobuf.TestProtos;
using Google.Protobuf.WellKnownTypes;
using NUnit.Framework;
using System;
using System.IO;
using System.Linq;
using Google.Protobuf.WellKnownTypes;
using Google.Protobuf.Collections;

namespace Google.Protobuf
{
Expand Down Expand Up @@ -112,14 +112,10 @@ public void Roundtrip_UnpairedSurrogate()
}

[Test]
public void InvalidUtf8ParsesAsReplacementChars()
public void ParseInvalidUtf8Rejected()
{
var payload = new byte[] { 0x72, 1, 0x80 };

// We would prefer to have this parse operation fail, but at the moment it substitutes
// the replacement character.
var message = TestAllTypes.Parser.ParseFrom(payload);
Assert.AreEqual("\ufffd", message.SingleString);
Assert.Throws<InvalidProtocolBufferException>(() => TestAllTypes.Parser.ParseFrom(payload));
}

[Test]
Expand Down
11 changes: 6 additions & 5 deletions csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#region Copyright notice and license
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2022 Google Inc. All rights reserved.
//
Expand All @@ -24,11 +24,12 @@ internal class ParsingPrimitivesTest
[TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)]
// Overlong form of "space"
[TestCase("\ufffd\ufffd", 0xc0, 0xa0)]
public void ReadRawString_NonUtf8(string expectedText, params int[] bytes)
public void ReadRawString_NonUtf8ThrowsInvalidProtocolBufferException(string expectedText, params int[] bytes)
{
var context = CreateContext(bytes);
string text = ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
Assert.AreEqual(expectedText, text);
Assert.Throws<InvalidProtocolBufferException>(() => {
var context = CreateContext(bytes);
ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
});
}

private static ParseContext CreateContext(int[] bytes)
Expand Down
6 changes: 6 additions & 0 deletions csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ internal static InvalidProtocolBufferException InvalidBase64(Exception innerExce
return new InvalidProtocolBufferException("Invalid base64 data", innerException);
}

internal static InvalidProtocolBufferException InvalidUtf8(Exception innerException)
{
return new InvalidProtocolBufferException(
"String is invalid UTF-8.", innerException);
}

internal static InvalidProtocolBufferException InvalidEndTag()
{
return new InvalidProtocolBufferException(
Expand Down
35 changes: 30 additions & 5 deletions csharp/src/Google.Protobuf/ParsingPrimitives.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#region Copyright notice and license
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
//
Expand All @@ -15,6 +15,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
using System.Text;

namespace Google.Protobuf
{
Expand All @@ -24,6 +25,9 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class ParsingPrimitives
{
internal static readonly Encoding Utf8Encoding =
new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

private const int StackallocThreshold = 256;

/// <summary>
Expand Down Expand Up @@ -576,7 +580,14 @@ public static string ReadRawString(ref ReadOnlySpan<byte> buffer, ref ParserInte
{
fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data))
{
value = WritingPrimitives.Utf8Encoding.GetString(sourceBytes, length);
try
{
value = Utf8Encoding.GetString(sourceBytes, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}
}

Expand Down Expand Up @@ -618,8 +629,14 @@ private static string ReadStringSlow(ref ReadOnlySpan<byte> buffer, ref ParserIn
// Make compiler happy by passing a new span created from pointer.
var tempSpan = new Span<byte>(pByteSpan, byteSpan.Length);
ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan);

return WritingPrimitives.Utf8Encoding.GetString(pByteSpan, length);
try
{
return Utf8Encoding.GetString(pByteSpan, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}
}
}
Expand All @@ -637,7 +654,15 @@ private static string ReadStringSlow(ref ReadOnlySpan<byte> buffer, ref ParserIn
// This will be called when reading from a Stream because we don't know the length of the stream,
// or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will
// throw an exception.
return WritingPrimitives.Utf8Encoding.GetString(ReadRawBytes(ref buffer, ref state, length), 0, length);
byte[] bytes = ReadRawBytes(ref buffer, ref state, length);
try
{
return Utf8Encoding.GetString(bytes, 0, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}

/// <summary>
Expand Down
17 changes: 0 additions & 17 deletions csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs

This file was deleted.

4 changes: 3 additions & 1 deletion csharp/src/Google.Protobuf/WritingPrimitives.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class WritingPrimitives
{
// Ideally we would use the same UTF8Encoding as parse, but we should be able to serialize
// invalid UTF-8 that make it in via unvalidated setters without throwing an exception.
#if NET5_0_OR_GREATER
internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize
#else
Expand All @@ -35,7 +37,7 @@ internal static class WritingPrimitives
// difference.)
#endif

#region Writing of values (not including tags)
#region Writing of values (not including tags)

/// <summary>
/// Writes a double field value, without a tag, to the stream.
Expand Down

0 comments on commit db9b2c8

Please sign in to comment.