From 3e82ed436b498a8ea953c89f99fc66eba0bdc6a0 Mon Sep 17 00:00:00 2001 From: Tony Liao Date: Thu, 31 Oct 2024 14:57:19 -0700 Subject: [PATCH] Generate internal hasbits for singular proto3 implicit presence fields. N.B.: - This change is not intended to affect any well-defined protobuf behaviour in an observable way. - The wire parsing codepath is not affected. - This change only affects the C++ protobuf implementation (other languages are not affected). - sizeof proto3 message objects may increase in 32-bit increments to accommodate hasbits. - When profiled on some of Google's largest binaries, we have seen a code size increase of ~0.1%, which we consider to be a reasonable increase. There are quite a few terminologies in the title: - **singular**: a field that is not repeated, not oneof, not extension, not lazy, just a field with a simple primitive type (number or boolean), or string/bytes. - **proto3**: describes behaviour consistent to the "proto3" syntax. This is equivalent to `edition = "2023"` with `option features.field_presence = IMPLICIT;`. - **implicit presence**: describes behaviour consistent with "non-optional" fields in proto3. This is described in more detail in https://protobuf.dev/programming-guides/field_presence/#presence-in-proto3-apis This change enables C++ proto3 objects to generate hasbits for regular proto3 (i.e. non-`optional`) fields. This code change might make certain codepaths negligibly more efficient, but large improvement or regression is unlikely. A larger performance improvement is expected from generating hasbits for repeated fields -- this change will pave the way for future work there. Hasbits in C++ will have slightly different semantics for implicit presence fields. In the past, all hasbits are true field presence indicators. If the hasbit is set, the field is guaranteed to be present; if the hasbit is unset, the field is guaranteed to be missing. This change introduces a new hasbit mode that I will call "hint hasbits", denoted by a newly-introduced enum, `internal::cpp::HasbitMode::kHintHasbit`. For implicit presence fields, it may be possible to mutate the field and have it end up as a zero field, especially with `mutable_foo` APIs. To handle those cases correctly, we unconditionally set the hasbit when `mutable_foo` is called, then we must do an additional check for field emptiness before serializing the field onto the wire. PiperOrigin-RevId: 691945237 --- .../cpp/field_generators/string_field.cc | 23 +- src/google/protobuf/compiler/cpp/message.cc | 80 ++++-- src/google/protobuf/descriptor.cc | 22 +- src/google/protobuf/descriptor.h | 29 +- src/google/protobuf/descriptor_unittest.cc | 272 ++++++++++++++++++ src/google/protobuf/dynamic_message.cc | 44 ++- .../protobuf/generated_message_reflection.cc | 204 ++++++++----- .../protobuf/generated_message_reflection.h | 4 + src/google/protobuf/message.h | 21 +- 9 files changed, 603 insertions(+), 96 deletions(-) diff --git a/src/google/protobuf/compiler/cpp/field_generators/string_field.cc b/src/google/protobuf/compiler/cpp/field_generators/string_field.cc index 1fc3951d49afd..bd1aa7f2b7c24 100644 --- a/src/google/protobuf/compiler/cpp/field_generators/string_field.cc +++ b/src/google/protobuf/compiler/cpp/field_generators/string_field.cc @@ -16,6 +16,7 @@ #include "absl/log/absl_check.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "google/protobuf/compiler/cpp/field.h" #include "google/protobuf/compiler/cpp/field_generators/generators.h" #include "google/protobuf/compiler/cpp/helpers.h" @@ -23,12 +24,15 @@ #include "google/protobuf/descriptor.h" #include "google/protobuf/descriptor.pb.h" #include "google/protobuf/io/printer.h" +#include "google/protobuf/port.h" namespace google { namespace protobuf { namespace compiler { namespace cpp { namespace { +using ::google::protobuf::internal::cpp::GetFieldHasbitMode; +using ::google::protobuf::internal::cpp::HasbitMode; using ::google::protobuf::internal::cpp::HasHasbit; using ::google::protobuf::io::AnnotationCollector; using Sub = ::google::protobuf::io::Printer::Sub; @@ -533,6 +537,22 @@ void SingularString::GenerateClearingCode(io::Printer* p) const { )cc"); } +// Returns "ClearNonDefaultToEmpty" or "ClearToEmpty" depending on whether the +// field might still point to the default string instance. +absl::string_view GetClearFunctionForField(const FieldDescriptor* field) { + switch (GetFieldHasbitMode(field)) { + case HasbitMode::kNoHasbit: + case HasbitMode::kHintHasbit: + // TODO: b/376149315 - Would be nice to call ClearNonDefaultToEmpty for + // hint hasbits too. + return "ClearToEmpty"; + case HasbitMode::kTrueHasbit: + return "ClearNonDefaultToEmpty"; + default: + internal::Unreachable(); + } +} + void SingularString::GenerateMessageClearingCode(io::Printer* p) const { if (is_oneof()) { p->Emit(R"cc( @@ -573,8 +593,7 @@ void SingularString::GenerateMessageClearingCode(io::Printer* p) const { return; } - p->Emit({{"Clear", - HasHasbit(field_) ? "ClearNonDefaultToEmpty" : "ClearToEmpty"}}, + p->Emit({{"Clear", GetClearFunctionForField(field_)}}, R"cc( $field_$.$Clear$(); )cc"); diff --git a/src/google/protobuf/compiler/cpp/message.cc b/src/google/protobuf/compiler/cpp/message.cc index f5ac295c41992..903331515a7cd 100644 --- a/src/google/protobuf/compiler/cpp/message.cc +++ b/src/google/protobuf/compiler/cpp/message.cc @@ -62,6 +62,8 @@ namespace cpp { namespace { using ::google::protobuf::internal::WireFormat; using ::google::protobuf::internal::WireFormatLite; +using ::google::protobuf::internal::cpp::GetFieldHasbitMode; +using ::google::protobuf::internal::cpp::HasbitMode; using ::google::protobuf::internal::cpp::HasHasbit; using Semantic = ::google::protobuf::io::AnnotationCollector::Semantic; using Sub = ::google::protobuf::io::Printer::Sub; @@ -187,7 +189,7 @@ RunMap FindRuns(const std::vector& fields, void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix, const FieldDescriptor* field) { - ABSL_CHECK(!HasHasbit(field)); + ABSL_CHECK(GetFieldHasbitMode(field) != HasbitMode::kTrueHasbit); ABSL_CHECK(!field->is_repeated()); ABSL_CHECK(!field->containing_oneof() || field->real_containing_oneof()); @@ -216,19 +218,27 @@ void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix, } bool ShouldEmitNonDefaultCheck(const FieldDescriptor* field) { - return (!field->is_repeated() && !field->containing_oneof()) || - field->real_containing_oneof(); + if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) { + return false; + } + return !field->is_repeated(); } // Emits an if-statement with a condition that evaluates to true if |field| is // considered non-default (will be sent over the wire), for message types // without true field presence. Should only be called if // !HasHasbit(field). +// If |with_enclosing_braces_always| is set to true, will generate enclosing +// braces even if nondefault check is not emitted -- i.e. code may look like: +// { +// // code... +// } +// If |with_enclosing_braces_always| is set to false, enclosing braces will not +// be generated if nondefault check is not emitted. void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix, const FieldDescriptor* field, - absl::AnyInvocable emit_body) { - ABSL_CHECK(!HasHasbit(field)); - + absl::AnyInvocable emit_body, + bool with_enclosing_braces_always) { if (ShouldEmitNonDefaultCheck(field)) { p->Emit( { @@ -240,7 +250,10 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix, $emit_body$; } )cc"); - } else { + return; + } + + if (with_enclosing_braces_always) { // In repeated fields, the same variable name may be emitted multiple // times, hence the need for emitting braces even when the if condition is // not necessary, so that the code looks like: @@ -259,7 +272,11 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix, $emit_body$; } )cc"); + return; } + + // If no enclosing braces need to be emitted, just emit the body directly. + emit_body(); } bool HasInternalHasMethod(const FieldDescriptor* field) { @@ -1034,7 +1051,7 @@ void MessageGenerator::GenerateSingularFieldHasBits( )cc"); return; } - if (HasHasbit(field)) { + if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) { auto v = p->WithVars(HasBitVars(field)); p->Emit( {Sub{"ASSUME", @@ -1244,7 +1261,8 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField( }; if (!HasHasbit(field)) { - MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body)); + MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body), + /*with_enclosing_braces_always=*/true); return; } if (field->options().weak()) { @@ -1260,10 +1278,17 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField( int has_bit_index = has_bit_indices_[field->index()]; p->Emit({{"mask", absl::StrFormat("0x%08xu", uint32_t{1} << (has_bit_index % 32))}, - {"emit_body", [&] { emit_body(); }}}, + {"check_nondefault_and_emit_body", + [&] { + // Note that it's possible that the field has explicit presence. + // In that case, nondefault check will not be emitted but + // emit_body will still be emitted. + MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body), + /*with_enclosing_braces_always=*/false); + }}}, R"cc( if (cached_has_bits & $mask$) { - $emit_body$; + $check_nondefault_and_emit_body$; } )cc"); } @@ -4227,9 +4252,10 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) { } else if (field->is_optional() && !HasHasbit(field)) { // Merge semantics without true field presence: primitive fields are // merged only if non-zero (numeric) or non-empty (string). - MayEmitIfNonDefaultCheck(p, "from.", field, /*emit_body=*/[&]() { - generator.GenerateMergingCode(p); - }); + MayEmitIfNonDefaultCheck( + p, "from.", field, + /*emit_body=*/[&]() { generator.GenerateMergingCode(p); }, + /*with_enclosing_braces_always=*/true); } else if (field->options().weak() || cached_has_word_index != HasWordIndex(field)) { // Check hasbit, not using cached bits. @@ -4250,10 +4276,20 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) { format("if (cached_has_bits & 0x$1$u) {\n", mask); format.Indent(); - if (check_has_byte && IsPOD(field)) { - generator.GenerateCopyConstructorCode(p); + if (GetFieldHasbitMode(field) == HasbitMode::kHintHasbit) { + // Merge semantics without true field presence: primitive fields are + // merged only if non-zero (numeric) or non-empty (string). + MayEmitIfNonDefaultCheck( + p, "from.", field, + /*emit_body=*/[&]() { generator.GenerateMergingCode(p); }, + /*with_enclosing_braces_always=*/false); } else { - generator.GenerateMergingCode(p); + ABSL_DCHECK(GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit); + if (check_has_byte && IsPOD(field)) { + generator.GenerateCopyConstructorCode(p); + } else { + generator.GenerateMergingCode(p); + } } format.Outdent(); @@ -4476,7 +4512,12 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p, if (HasHasbit(field)) { p->Emit( { - {"body", emit_body}, + {"body", + [&]() { + MayEmitIfNonDefaultCheck(p, "this_.", field, + std::move(emit_body), + /*with_enclosing_braces_always=*/false); + }}, {"cond", [&] { int has_bit_index = HasBitIndex(field); @@ -4496,7 +4537,8 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p, } )cc"); } else if (field->is_optional()) { - MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body)); + MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body), + /*with_enclosing_braces_always=*/true); } else { emit_body(); } diff --git a/src/google/protobuf/descriptor.cc b/src/google/protobuf/descriptor.cc index 637921b0ce92a..2ff141059ca3f 100644 --- a/src/google/protobuf/descriptor.cc +++ b/src/google/protobuf/descriptor.cc @@ -9798,9 +9798,27 @@ bool HasPreservingUnknownEnumSemantics(const FieldDescriptor* field) { return field->enum_type() != nullptr && !field->enum_type()->is_closed(); } +HasbitMode GetFieldHasbitMode(const FieldDescriptor* field) { + // Do not generate hasbits for "real-oneof" and weak fields. + if (field->real_containing_oneof() || field->options().weak()) { + return HasbitMode::kNoHasbit; + } + + // Explicit-presence fields always have true hasbits. + if (field->has_presence()) { + return HasbitMode::kTrueHasbit; + } + + // Implicit presence fields. + if (!field->is_repeated()) { + return HasbitMode::kHintHasbit; + } + // We currently don't implement hasbits for implicit repeated fields. + return HasbitMode::kNoHasbit; +} + bool HasHasbit(const FieldDescriptor* field) { - return field->has_presence() && !field->real_containing_oneof() && - !field->options().weak(); + return GetFieldHasbitMode(field) != HasbitMode::kNoHasbit; } static bool IsVerifyUtf8(const FieldDescriptor* field, bool is_lite) { diff --git a/src/google/protobuf/descriptor.h b/src/google/protobuf/descriptor.h index 9ac74624032a5..3bc2a1b5e2f98 100644 --- a/src/google/protobuf/descriptor.h +++ b/src/google/protobuf/descriptor.h @@ -42,6 +42,7 @@ #include "google/protobuf/stubs/common.h" #include "absl/base/attributes.h" #include "absl/base/call_once.h" +#include "absl/base/optimization.h" #include "absl/container/btree_map.h" #include "absl/container/flat_hash_map.h" #include "absl/functional/any_invocable.h" @@ -2949,9 +2950,33 @@ constexpr int MaxMessageDeclarationNestingDepth() { return 32; } PROTOBUF_EXPORT bool HasPreservingUnknownEnumSemantics( const FieldDescriptor* field); -PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field); - #ifndef SWIG +enum class HasbitMode : uint8_t { + // Hasbits do not exist for the field. + kNoHasbit, + // Hasbits exist and indicate field presence. + // Hasbit is set if and only if field is present. + kTrueHasbit, + // Hasbits exist and "hint at" field presence. + // When hasbit is set, field is 'probably' present, but field accessors must + // still check for field presence (i.e. false positives are possible). + // When hasbit is unset, field is guaranteed to be not present. + kHintHasbit, +}; + +// Returns the "hasbit mode" of the field. Depending on the implementation, a +// field can: +// - have no hasbits in its internal object (kNoHasbit); +// - have hasbits where hasbit == 1 indicates field presence and hasbit == 0 +// indicates an unset field (kTrueHasbit); +// - have hasbits where hasbit == 1 indicates "field is possibly modified" and +// hasbit == 0 indicates "field is definitely missing" (kHintHasbit). +PROTOBUF_EXPORT HasbitMode GetFieldHasbitMode(const FieldDescriptor* field); + +// Returns true if there are hasbits for the field. +// Note that this does not correlate with "hazzer"s, i.e., whether has_foo APIs +// are emitted. +PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field); enum class Utf8CheckMode : uint8_t { kStrict = 0, // Parsing will fail if non UTF-8 data is in string fields. diff --git a/src/google/protobuf/descriptor_unittest.cc b/src/google/protobuf/descriptor_unittest.cc index c9a109c6691e7..347e05fec846c 100644 --- a/src/google/protobuf/descriptor_unittest.cc +++ b/src/google/protobuf/descriptor_unittest.cc @@ -20,8 +20,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -77,7 +79,10 @@ // Must be included last. #include "google/protobuf/port_def.inc" +using ::google::protobuf::internal::cpp::GetFieldHasbitMode; using ::google::protobuf::internal::cpp::GetUtf8CheckMode; +using ::google::protobuf::internal::cpp::HasbitMode; +using ::google::protobuf::internal::cpp::HasHasbit; using ::google::protobuf::internal::cpp::HasPreservingUnknownEnumSemantics; using ::google::protobuf::internal::cpp::Utf8CheckMode; using ::testing::AnyOf; @@ -2994,6 +2999,273 @@ TEST_F(MiscTest, FieldOptions) { EXPECT_EQ(FieldOptions::CORD, bar->options().ctype()); } +// =================================================================== + +struct HasHasbitTestParam { + struct ExpectedOutput { + HasbitMode expected_hasbitmode; + bool expected_has_presence; + bool expected_has_hasbit; + }; + + std::string input_foo_proto; + ExpectedOutput expected_output; +}; + +class HasHasbitTest : public testing::TestWithParam { + protected: + void SetUp() override { + ASSERT_TRUE( + TextFormat::ParseFromString(GetParam().input_foo_proto, &foo_proto_)); + foo_ = pool_.BuildFile(foo_proto_); + } + + const FieldDescriptor* GetField() { return foo_->message_type(0)->field(0); } + + DescriptorPool pool_; + FileDescriptorProto foo_proto_; + const FileDescriptor* foo_; +}; + +TEST_P(HasHasbitTest, TestHasHasbitExplicitPresence) { + EXPECT_EQ(GetField()->has_presence(), + GetParam().expected_output.expected_has_presence); + EXPECT_EQ(GetFieldHasbitMode(GetField()), + GetParam().expected_output.expected_hasbitmode); + EXPECT_EQ(HasHasbit(GetField()), + GetParam().expected_output.expected_has_hasbit); +} + +// NOTE: with C++20 we can use designated initializers to ensure +// that struct members match commented names, but as we are still working with +// C++17 in the foreseeable future, we won't be able to refactor this for a +// while... +// https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md +INSTANTIATE_TEST_SUITE_P( + HasHasbitLegacySyntaxTests, HasHasbitTest, + testing::Values( + // Test case: proto2 singular fields + HasHasbitTestParam{R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'proto2' + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_INT64 + label: LABEL_OPTIONAL + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kTrueHasbit, + /*expected_has_presence=*/true, + /*expected_has_hasbit=*/true, + }}, + // Test case: proto2 repeated fields + HasHasbitTestParam{R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'proto2' + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_STRING + label: LABEL_REPEATED + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kNoHasbit, + /*expected_has_presence=*/false, + /*expected_has_hasbit=*/false, + }}, + // Test case: proto3 singular fields + HasHasbitTestParam{R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'proto3' + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_INT64 + label: LABEL_OPTIONAL + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kHintHasbit, + /*expected_has_presence=*/false, + /*expected_has_hasbit=*/true, + }}, + // Test case: proto3 optional fields + HasHasbitTestParam{ + R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'proto3' + message_type { + name: 'Foo' + field { + name: 'int_field' + number: 1 + type: TYPE_INT32 + label: LABEL_OPTIONAL + oneof_index: 0 + proto3_optional: true + } + oneof_decl { name: '_int_field' } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kTrueHasbit, + /*expected_has_presence=*/true, + /*expected_has_hasbit=*/true, + }}, + // Test case: proto3 repeated fields + HasHasbitTestParam{R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'proto3' + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_STRING + label: LABEL_REPEATED + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kNoHasbit, + /*expected_has_presence=*/false, + /*expected_has_hasbit=*/false, + }})); + +// NOTE: with C++20 we can use designated initializers to ensure +// that struct members match commented names, but as we are still working with +// C++17 in the foreseeable future, we won't be able to refactor this for a +// while... +// https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md +INSTANTIATE_TEST_SUITE_P( + HasHasbitEditionsTests, HasHasbitTest, + testing::Values( + // Test case: explicit-presence, singular fields + HasHasbitTestParam{ + R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'editions' + edition: EDITION_2023 + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_INT64 + options { features { field_presence: EXPLICIT } } + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kTrueHasbit, + /*expected_has_presence=*/true, + /*expected_has_hasbit=*/true, + }}, + // Test case: implicit-presence, singular fields + HasHasbitTestParam{ + R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'editions' + edition: EDITION_2023 + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_INT64 + options { features { field_presence: IMPLICIT } } + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kHintHasbit, + /*expected_has_presence=*/false, + /*expected_has_hasbit=*/true, + }}, + // Test case: oneof fields. + // Note that oneof fields can't specify field presence. + HasHasbitTestParam{ + R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'editions' + edition: EDITION_2023 + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_STRING + oneof_index: 0 + } + oneof_decl { name: "onebar" } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kNoHasbit, + /*expected_has_presence=*/true, + /*expected_has_hasbit=*/false, + }}, + // Test case: message fields. + // Note that message fields cannot specify implicit presence. + HasHasbitTestParam{ + R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'editions' + edition: EDITION_2023 + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_MESSAGE + type_name: "Bar" + } + } + message_type { + name: 'Bar' + field { name: 'int_field' number: 1 type: TYPE_INT32 } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kTrueHasbit, + /*expected_has_presence=*/true, + /*expected_has_hasbit=*/true, + }}, + // Test case: repeated fields. + // Note that repeated fields can't specify presence. + HasHasbitTestParam{R"pb(name: 'foo.proto' + package: 'foo' + syntax: 'editions' + edition: EDITION_2023 + message_type { + name: 'FooMessage' + field { + name: 'f' + number: 1 + type: TYPE_STRING + label: LABEL_REPEATED + } + } + )pb", + /*expected_output=*/{ + /*expected_hasbitmode=*/HasbitMode::kNoHasbit, + /*expected_has_presence=*/false, + /*expected_has_hasbit=*/false, + }})); + + // =================================================================== enum DescriptorPoolMode { NO_DATABASE, FALLBACK_DATABASE }; diff --git a/src/google/protobuf/dynamic_message.cc b/src/google/protobuf/dynamic_message.cc index 43a00ad800cd1..5c403716d6719 100644 --- a/src/google/protobuf/dynamic_message.cc +++ b/src/google/protobuf/dynamic_message.cc @@ -53,6 +53,7 @@ #include "absl/base/attributes.h" #include "absl/hash/hash.h" #include "absl/log/absl_check.h" +#include "absl/log/absl_log.h" #include "absl/types/variant.h" #include "absl/utility/utility.h" #include "google/protobuf/arenastring.h" @@ -465,6 +466,11 @@ namespace { bool IsMapFieldInApi(const FieldDescriptor* field) { return field->is_map(); } +bool IsMapEntryField(const FieldDescriptor* field) { + return (field->containing_type() != nullptr && + field->containing_type()->options().map_entry()); +} + inline bool InRealOneof(const FieldDescriptor* field) { return field->real_containing_oneof() != nullptr; @@ -1086,7 +1092,43 @@ const Message* DynamicMessageFactory::GetPrototypeNoLock( type_info->has_bits_offset = -1; int max_hasbit = 0; for (int i = 0; i < type->field_count(); i++) { - if (internal::cpp::HasHasbit(type->field(i))) { + const FieldDescriptor* field = type->field(i); + + // If a field has hasbits, it could be either an explicit-presence or + // implicit-presence field. Explicit presence fields will have "true + // hasbits" where hasbit is set iff field is present. Implicit presence + // fields will have "hint hasbits" where + // - if hasbit is unset, field is not present. + // - if hasbit is set, field is present if it is also nonempty. + if (internal::cpp::HasHasbit(field)) { + // TODO: b/112602698 - during Python textproto serialization, MapEntry + // messages may be generated from DynamicMessage on the fly. C++ + // implementations of MapEntry messages always have hasbits, but + // has_presence return values might be different depending on how field + // presence is set. For MapEntrys, has_presence returns true for + // explicit-presence (proto2) messages and returns false for + // implicit-presence (proto3) messages. + // + // In the case of implicit presence, there is a potential inconsistency in + // code behavior between C++ and Python: + // - If C++ implementation is linked, hasbits are always generated for + // MapEntry messages, and MapEntry messages will behave like explicit + // presence. + // - If C++ implementation is not linked, Python defaults to the + // DynamicMessage implementation for MapEntrys which traditionally does + // not assume the presence of hasbits, so the default Python behavior + // for MapEntry messages (by default C++ implementations are not linked) + // will fall back to the DynamicMessage implementation and behave like + // implicit presence. + // This is an inconsistency and this if-condition preserves it. + // + // Longer term, we want to get rid of this additional if-check of + // IsMapEntryField. It might take one or more breaking changes and more + // consensus gathering & clarification though. + if (!field->has_presence() && IsMapEntryField(field)) { + continue; + } + if (type_info->has_bits_offset == -1) { // At least one field in the message requires a hasbit, so allocate // hasbits. diff --git a/src/google/protobuf/generated_message_reflection.cc b/src/google/protobuf/generated_message_reflection.cc index baf77078f662c..6ce9db5684fd3 100644 --- a/src/google/protobuf/generated_message_reflection.cc +++ b/src/google/protobuf/generated_message_reflection.cc @@ -31,6 +31,7 @@ #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" +#include "google/protobuf/arena.h" #include "google/protobuf/descriptor.h" #include "google/protobuf/descriptor.pb.h" #include "google/protobuf/descriptor_lite.h" @@ -79,6 +80,11 @@ namespace protobuf { namespace { bool IsMapFieldInApi(const FieldDescriptor* field) { return field->is_map(); } +bool IsMapEntry(const FieldDescriptor* field) { + return (field->containing_type() != nullptr && + field->containing_type()->options().map_entry()); +} + Message* MaybeForceCopy(Arena* arena, Message* msg) { if (arena != nullptr || msg == nullptr) return msg; @@ -1178,7 +1184,7 @@ void Reflection::SwapFieldsImpl( // oneof already. This has to be done after SwapField, because SwapField // may depend on the information in has bits. if (!field->is_repeated()) { - SwapHasBit(message1, message2, field); + NaiveSwapHasBit(message1, message2, field); if (field->cpp_type() == FieldDescriptor::CPPTYPE_STRING && field->cpp_string_type() == FieldDescriptor::CppStringType::kString && @@ -1749,7 +1755,8 @@ void Reflection::ListFields(const Message& message, } } else if (has_bits && has_bits_indices[i] != static_cast(-1)) { // Equivalent to: HasFieldSingular(message, field) - if (IsIndexInHasBitSet(has_bits, has_bits_indices[i])) { + if (IsFieldPresentGivenHasbits(message, field, has_bits, + has_bits_indices[i])) { append_to_output(field); } } else if (HasFieldSingular(message, field)) { @@ -2989,74 +2996,127 @@ void Reflection::SwapInlinedStringDonated(Message* lhs, Message* rhs, } } -// Simple accessors for manipulating has_bits_. +bool Reflection::IsSingularFieldNonEmpty(const Message& message, + const FieldDescriptor* field) const { + ABSL_DCHECK(IsMapEntry(field) || !field->has_presence()); + ABSL_DCHECK(!field->is_repeated()); + ABSL_DCHECK(!field->is_map()); + ABSL_DCHECK(field->cpp_type() != FieldDescriptor::CPPTYPE_MESSAGE); + // Scalar primitive (numeric or string/bytes) fields are present if + // their value is non-zero (numeric) or non-empty (string/bytes). N.B.: + // we must use this definition here, rather than the "scalar fields + // always present" in the proto3 docs, because MergeFrom() semantics + // require presence as "present on wire", and reflection-based merge + // (which uses HasField()) needs to be consistent with this. + switch (field->cpp_type()) { + case FieldDescriptor::CPPTYPE_BOOL: + return GetRaw(message, field) != false; + case FieldDescriptor::CPPTYPE_INT32: + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_INT64: + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_UINT32: + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_UINT64: + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_FLOAT: + static_assert(sizeof(uint32_t) == sizeof(float), + "Code assumes uint32_t and float are the same size."); + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_DOUBLE: + static_assert(sizeof(uint64_t) == sizeof(double), + "Code assumes uint64_t and double are the same size."); + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_ENUM: + return GetRaw(message, field) != 0; + case FieldDescriptor::CPPTYPE_STRING: + switch (field->cpp_string_type()) { + case FieldDescriptor::CppStringType::kCord: + return !GetField(message, field).empty(); + case FieldDescriptor::CppStringType::kView: + case FieldDescriptor::CppStringType::kString: { + if (IsInlined(field)) { + return !GetField(message, field) + .GetNoArena() + .empty(); + } + + return !GetField(message, field).Get().empty(); + } + default: + internal::Unreachable(); + } + case FieldDescriptor::CPPTYPE_MESSAGE: + default: + internal::Unreachable(); + } +} + +bool Reflection::IsFieldPresentGivenHasbits(const Message& message, + const FieldDescriptor* field, + const uint32_t* hasbits, + uint32_t hasbit_index) const { + // If hasbit exists but is not set, field is guaranteed to be missing. + if (!IsIndexInHasBitSet(hasbits, hasbit_index)) { + return false; + } + + // For explicit-presence fields, a set hasbit indicates a present field. + if (field->has_presence()) { + return true; + } + + // proto3: hasbits are present, but an additional zero check must be + // performed because hasbit can be set to true while field is zero. + + // Repeated fields do not have hasbits enabled in proto3. + ABSL_DCHECK(!field->is_repeated()) + << "repeated fields do not have hasbits in proto3."; + + // Handling map entries in proto3: + // Implicit presence map fields are represented as a native C++ map, but their + // corresponding MapEntry messages (e.g. if we want to access them as repeated + // MapEntry fields) will unconditionally be generated with hasbits. MapEntrys + // behave like explicit presence fields. That is, in MapEntry's C++ + // implementation... + // - key can be null, empty, or nonempty; + // - value can be null, empty, or nonempty. + if (IsMapEntry(field)) { + return true; + } + + // This is the vanilla case: for a non-repeated primitive or string field, + // returns if the field is nonzero (i.e. present in proto3 semantics). + return IsSingularFieldNonEmpty(message, field); +} + bool Reflection::HasFieldSingular(const Message& message, const FieldDescriptor* field) const { ABSL_DCHECK(!field->options().weak()); if (schema_.HasBitIndex(field) != static_cast(-1)) { - return IsIndexInHasBitSet(GetHasBits(message), schema_.HasBitIndex(field)); + return IsFieldPresentGivenHasbits(message, field, GetHasBits(message), + schema_.HasBitIndex(field)); } - // proto3: no has-bits. All fields present except messages, which are + // The python implementation traditionally assumes that proto3 messages don't + // have hasbits. As a result, proto3 objects created through dynamic message + // in Python won't have hasbits. We need the following code to preserve + // compatibility. + // NOTE: It would be nice to be able to remove it, but we need one + // or more breaking changes in order to do so. + // + // proto3 with no has-bits. All fields present except messages, which are // present only if their message-field pointer is non-null. if (field->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE) { return !schema_.IsDefaultInstance(message) && GetRaw(message, field) != nullptr; - } else { - // Non-message field (and non-oneof, since that was handled in HasField() - // before calling us), and singular (again, checked in HasField). So, this - // field must be a scalar. - - // Scalar primitive (numeric or string/bytes) fields are present if - // their value is non-zero (numeric) or non-empty (string/bytes). N.B.: - // we must use this definition here, rather than the "scalar fields - // always present" in the proto3 docs, because MergeFrom() semantics - // require presence as "present on wire", and reflection-based merge - // (which uses HasField()) needs to be consistent with this. - switch (field->cpp_type()) { - case FieldDescriptor::CPPTYPE_STRING: - switch (field->cpp_string_type()) { - case FieldDescriptor::CppStringType::kCord: - return !GetField(message, field).empty(); - case FieldDescriptor::CppStringType::kView: - case FieldDescriptor::CppStringType::kString: { - if (IsInlined(field)) { - return !GetField(message, field) - .GetNoArena() - .empty(); - } - - return GetField(message, field).Get().size() > 0; - } - } - internal::Unreachable(); - case FieldDescriptor::CPPTYPE_BOOL: - return GetRaw(message, field) != false; - case FieldDescriptor::CPPTYPE_INT32: - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_INT64: - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_UINT32: - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_UINT64: - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_FLOAT: - static_assert(sizeof(uint32_t) == sizeof(float), - "Code assumes uint32_t and float are the same size."); - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_DOUBLE: - static_assert(sizeof(uint64_t) == sizeof(double), - "Code assumes uint64_t and double are the same size."); - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_ENUM: - return GetRaw(message, field) != 0; - case FieldDescriptor::CPPTYPE_MESSAGE: - // handled above; avoid warning - break; - } - ABSL_LOG(FATAL) << "Reached impossible case in HasFieldSingular()."; - return false; } + + // Non-message field (and non-oneof, since that was handled in HasField() + // before calling us), and singular (again, checked in HasField). So, this + // field must be a scalar. + + return IsSingularFieldNonEmpty(message, field); } void Reflection::SetHasBit(Message* message, @@ -3077,23 +3137,31 @@ void Reflection::ClearHasBit(Message* message, ~(static_cast(1) << (index % 32)); } -void Reflection::SwapHasBit(Message* message1, Message* message2, - const FieldDescriptor* field) const { +void Reflection::NaiveSwapHasBit(Message* message1, Message* message2, + const FieldDescriptor* field) const { ABSL_DCHECK(!field->options().weak()); if (!schema_.HasHasbits()) { return; } - bool temp_is_present = HasFieldSingular(*message1, field); - if (HasFieldSingular(*message2, field)) { - SetHasBit(message1, field); - } else { - ClearHasBit(message1, field); - } - if (temp_is_present) { + const Reflection* r1 = message1->GetReflection(); + const Reflection* r2 = message2->GetReflection(); + + bool is_m1_hasbit_set = IsIndexInHasBitSet(r1->GetHasBits(*message1), + r1->schema_.HasBitIndex(field)); + bool is_m2_hasbit_set = IsIndexInHasBitSet(r2->GetHasBits(*message2), + r2->schema_.HasBitIndex(field)); + + if (is_m1_hasbit_set) { SetHasBit(message2, field); } else { ClearHasBit(message2, field); } + + if (is_m2_hasbit_set) { + SetHasBit(message1, field); + } else { + ClearHasBit(message1, field); + } } bool Reflection::HasOneof(const Message& message, diff --git a/src/google/protobuf/generated_message_reflection.h b/src/google/protobuf/generated_message_reflection.h index aa75cd0a9511d..ef08dc77de163 100644 --- a/src/google/protobuf/generated_message_reflection.h +++ b/src/google/protobuf/generated_message_reflection.h @@ -148,6 +148,10 @@ struct ReflectionSchema { sizeof(uint32_t)); } + // Returns true iff the field object has usable hasbit offset. + // Note that this is not necessarily correlated with *field presence* : + // Fields with implicit presence (i.e. ones that don't expose has_foo API) + // can still have hasbits in their underlying implementation. bool HasHasbits() const { return has_bits_offset_ != -1; } // Bit index within the bit array of hasbits. Bit order is low-to-high. diff --git a/src/google/protobuf/message.h b/src/google/protobuf/message.h index 92edb721bdcc6..ee79157d12bcb 100644 --- a/src/google/protobuf/message.h +++ b/src/google/protobuf/message.h @@ -1234,6 +1234,17 @@ class PROTOBUF_EXPORT Reflection final { return schema_.IsFieldInlined(field); } + // For "proto3 non-optional" primitive fields, aka implicit-presence fields, + // returns true if the field is populated, i.e., nonzero. False otherwise. + bool IsSingularFieldNonEmpty(const Message& message, + const FieldDescriptor* field) const; + // Returns whether the field is present if there are usable hasbits in the + // field schema. (Note that in some cases hasbits are merely a hint to + // indicate "possible presence", and another empty-check is required). + bool IsFieldPresentGivenHasbits(const Message& message, + const FieldDescriptor* field, + const uint32_t* hasbits, + uint32_t hasbit_index) const; // Returns true if the field is considered to be present. // Requires the input to be 'singular' i.e. non-extension, non-oneof, non-weak // field. @@ -1243,8 +1254,14 @@ class PROTOBUF_EXPORT Reflection final { const FieldDescriptor* field) const; void SetHasBit(Message* message, const FieldDescriptor* field) const; inline void ClearHasBit(Message* message, const FieldDescriptor* field) const; - inline void SwapHasBit(Message* message1, Message* message2, - const FieldDescriptor* field) const; + // Naively swaps the hasbit without checking for field existence. + // For explicit presence fields, the hasbit is swapped normally. + // For implicit presence fields, the hasbit is swapped without checking for + // field emptiness. That is, the destination message may have hasbit set even + // if the field is empty. This should still result in correct behaviour due to + // HasbitMode being set to kHintHasbits for implicit presence fields. + inline void NaiveSwapHasBit(Message* message1, Message* message2, + const FieldDescriptor* field) const; inline const uint32_t* GetInlinedStringDonatedArray( const Message& message) const;