Merge with main to pull in a fix to an unrelated test.

protocolbuffers · Nov 2, 2023 · c46f0e4 · c46f0e4
2 parents cf02e0b + f78f9c5
commit c46f0e4
Show file tree

Hide file tree

Showing 29 changed files with 465 additions and 132 deletions.
diff --git a/.github/workflows/test_cpp.yml b/.github/workflows/test_cpp.yml
@@ -324,8 +324,8 @@ jobs:
             # for Apple Silicon to detect issues there.
             bazel: build --cpu=darwin_arm64 //src/...
           - name: Windows
-            os: windows-2019
-            cache_key: windows-2019
+            os: windows-2022
+            cache_key: windows-2022
             bazel: test //src/...  @com_google_protobuf_examples//... --test_tag_filters=-conformance --build_tag_filters=-conformance
     name: ${{ matrix.name }} Bazel
     runs-on: ${{ matrix.os }}
@@ -351,44 +351,46 @@ jobs:
             flags: -DCMAKE_CXX_STANDARD=14
             cache-prefix: macos-cmake
           - name: Windows CMake
-            os: windows-2019
-            flags: >-
-              -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
-              -Dprotobuf_BUILD_SHARED_LIBS=OFF
-              -Dprotobuf_BUILD_EXAMPLES=ON
-            vsversion: '2019'
-            cache-prefix: windows-2019-cmake
-          - name: Windows CMake 2022
             os: windows-2022
             flags: >-
               -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
               -Dprotobuf_BUILD_SHARED_LIBS=OFF
               -Dprotobuf_BUILD_EXAMPLES=ON
             vsversion: '2022'
             cache-prefix: windows-2022-cmake
-          - name: Windows CMake 32-bit
+          - name: Windows CMake 2019
             os: windows-2019
             flags: >-
               -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
+              -Dprotobuf_BUILD_SHARED_LIBS=OFF
+              -Dprotobuf_BUILD_EXAMPLES=ON
             vsversion: '2019'
+            cache-prefix: windows-2019-cmake
+            # windows-2019 has python3.7 installed, which is incompatible with the latest gcloud
+            python-version: '3.8'
+          - name: Windows CMake 32-bit
+            os: windows-2022
+            flags: >-
+              -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
+            vsversion: '2022'
             windows-arch: 'win32'
-            cache-prefix: windows-2019-win32-cmake
+            cache-prefix: windows-2022-win32-cmake
           - name: Windows CMake Shared
-            os: windows-2019
+            os: windows-2022
             flags: >-
               -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
               -Dprotobuf_BUILD_SHARED_LIBS=ON
-            vsversion: '2019'
-            cache-prefix: windows-2019-cmake
+            vsversion: '2022'
+            cache-prefix: windows-2022-cmake
           - name: Windows CMake Install
-            os: windows-2019
+            os: windows-2022
             install-flags: -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF -Dprotobuf_BUILD_TESTS=OFF
             flags: >-
               -G Ninja -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_CONFORMANCE=OFF
               -Dprotobuf_REMOVE_INSTALLED_HEADERS=ON
               -Dprotobuf_BUILD_PROTOBUF_BINARIES=OFF
-            vsversion: '2019'
-            cache-prefix: windows-2019-cmake
+            vsversion: '2022'
+            cache-prefix: windows-2022-cmake
     name: ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     steps:
@@ -405,6 +407,17 @@ jobs:
           arch: ${{ matrix.windows-arch || 'x64' }}
           vsversion: ${{ matrix.vsversion }}
 
+      # Workaround for incompatibility between gcloud and windows-2019 runners.
+      - name: Install Python
+        if: ${{ matrix.python-version }}
+        uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Use custom python for gcloud
+        if: ${{ matrix.python-version }}
+        run: echo "CLOUDSDK_PYTHON=${Python3_ROOT_DIR}\\python3" >> $GITHUB_ENV
+        shell: bash
+
       - name: Setup sccache
         uses: protocolbuffers/protobuf-ci/sccache@v2
         with:

diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
@@ -76,6 +76,7 @@ jobs:
         with:
           python-version: ${{ matrix.version }}
           cache: pip
+          cache-dependency-path: 'python/requirements.txt'
 
       - name: Validate version
         run: python3 --version | grep ${{ matrix.version }} || (echo "Invalid Python version - $(python3 --version)" && exit 1)

diff --git a/.github/workflows/test_upb.yml b/.github/workflows/test_upb.yml
@@ -64,12 +64,16 @@ jobs:
     strategy:
       fail-fast: false   # Don't cancel all jobs if one fails.
     name: Windows
-    runs-on: windows-2019
+    runs-on: windows-2022
     steps:
       - name: Checkout pending changes
         uses: protocolbuffers/protobuf-ci/checkout@v2
         with:
           ref: ${{ inputs.safe-checkout }}
+      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
+        with:
+          cache: pip
+          cache-dependency-path: 'python/requirements.txt'
       - name: Run tests
         uses: protocolbuffers/protobuf-ci/bazel@v2
         with:
@@ -92,6 +96,10 @@ jobs:
         uses: protocolbuffers/protobuf-ci/checkout@v2
         with:
           ref: ${{ inputs.safe-checkout }}
+      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
+        with:
+          cache: pip
+          cache-dependency-path: 'python/requirements.txt'
       - name: Run tests
         uses: protocolbuffers/protobuf-ci/bazel@v2
         with:
@@ -195,7 +203,7 @@ jobs:
         with:
           name: requirements
           path: requirements
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
         with:
           python-version: ${{ matrix.python-version }}
           architecture: ${{ matrix.architecture }}
@@ -250,7 +258,7 @@ jobs:
           path: wheels
       - name: Delete Binary Wheels
         run: find wheels -type f | grep -v none-any | xargs rm
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
         with:
           python-version: ${{ matrix.python-version }}
       - name: Setup Python venv

diff --git a/docs/design/editions/README.md b/docs/design/editions/README.md
@@ -36,3 +36,4 @@ The following topics are in this repository:
 *   [Edition Naming](edition-naming.md)
 *   [Editions Feature Visibility](editions-feature-visibility.md)
 *   [Legacy Syntax Editions](legacy-syntax-editions.md)
+*   [Editions: Feature Extension Layout](editions-feature-extension-layout.md)
diff --git a/docs/design/editions/editions-feature-extension-layout.md b/docs/design/editions/editions-feature-extension-layout.md
@@ -0,0 +1,150 @@
+# Editions: Feature Extension Layout
+
+**Author:** [@mkruskal-google](https://github.com/mkruskal-google),
+[@zhangskz](https://github.com/zhangskz)
+
+**Approved:** 2023-08-23
+
+## Background
+
+"[What are Protobuf Editions](what-are-protobuf-editions.md)" lays out a plan
+for allowing for more targeted features not owned by the protobuf team. It uses
+extensions of the global features proto to implement this. One thing that was
+left a bit ambiguous was *who* should own these extensions. Language, code
+generator, and runtime implementations are all similar but not identical
+distinctions.
+
+"Editions Zero Feature: utf8_validation" (not available externally, though a
+later version,
+"[Editions Zero: utf8_validation Without Problematic Options](editions-zero-utf8_validation.md)"
+is) is a recent plan to add a new set of generator features for utf8 validation.
+While the sole feature we had originally created (`legacy_closed_enum` in Java
+and C++) didn't have any ambiguity here, this one did. Specifically in Python,
+the current behaviors across proto2/proto3 are distinct for all 3
+implementations: pure python, Python/C++, Python/upb.
+
+## Overview
+
+In meetings, we've discussed various alternatives, captured below. The original
+plan was to make feature extensions runtime implementation-specific (e.g. C++,
+Java, Python, upb). There are some notable complications that came up though:
+
+1.  **Polyglot** - it's not clear how upb or C++ runtimes should behave in
+    multi-language situations. Which feature sets do they consider for runtime
+    behaviors? *Note: this is already a serious issue today, where all proto2
+    strings and many proto3 strings are completely unsafe across languages.*
+
+2.  **Shared Implementations** - Runtimes like upb and C++ are used as backing
+    implementations of multiple other languages (e.g. Python, Rust, Ruby, PHP).
+    If we have a single set of `upb` or `cpp` features, migrating to those
+    shared implementations would be more difficult (since there's no independent
+    switches per-language). *Note: this is already the situation we're in today,
+    where switching the runtime implementation can cause subtle and dangerous
+    behavior changes.*
+
+Given that we only have two behaviors, and one of them is unambiguous, it seems
+reasonable to punt on this decision until we have more information. We may
+encounter more edge cases that require feature extensions (and give us more
+information) during the rollout of edition zero. We also have a lot of freedom
+to re-model features in later editions, so keeping the initial implementation as
+simple as possible seems best (i.e. Alternative 2).
+
+## Alternatives
+
+### Alternative 1: Runtime Implementation Features
+
+Features would be per-runtime implementation as originally described in
+"Editions Zero Feature: utf8_validation." For example, Protobuf Python users
+would set different features depending on the backing implementation (e.g.
+`features.(pb.cpp).<feature>`, `features.(pb.upb).<feature>`).
+
+#### Pros
+
+*   Most consistent with range of behaviors expressible pre-Editions
+
+#### Cons
+
+*   Implementation may / should not be obvious to users.
+*   Lack of levers specifically for language / implementation combos. For
+    example, there is no way to set Python-C++ behavior independently of C++
+    behavior which may make migration harder from other Python implementations.
+
+### Alternative 2: Generator Features
+
+Features would be per-generator only (i.e. each protoc plugin would own one set
+of features). This was the second decision we made in later discussions, and
+while very similar to the above alternative, it's more inline with our goal of
+making features primarily for codegen.
+
+For example, all Python implementations would share the same set of features
+(e.g. `features.(pb.python).<feature>`). However, certain features could be
+targeted to specific implementations (e.g.
+`features.(pb.python).upb_utf8_validation` would only be used by Python/upb).
+
+#### Pros
+
+*   Allows independent controls of shared implementations in different target
+    languages (e.g. Python's upb feature won't affect PHP).
+
+#### Cons
+
+*   Possible complexity in upb to understand which language's features to
+    respect. UPB is not currently aware of what language it is being used for.
+*   Limits in-process sharing across languages with shared implementations (e.g.
+    Python upb, PHP upb) in the case of conflicting behaviors.
+    *   Additional checks may be needed.
+
+### Alternative 3: Migrate to bytes
+
+Since this whole discussion revolves around the utf8 validation feature, one
+option would be to just remove it from edition zero. Instead of adding a new
+toggle for UTF8 behavior, we could simply migrate everyone who doesn't enforce
+utf8 today to `bytes`. This would likely need another new *codegen* feature for
+generating byte getters/setters as strings, but that wouldn't have any of the
+ambiguity we're seeing today.
+
+Unfortunately, this doesn't seem feasible because of all the different behaviors
+laid out in "Editions Zero Feature: utf8_validation." UTF8 validation isn't
+really a binary on/off decision, and it can vary widely between languages. There
+are many cases where UTF8 is validated in **some** languages but not others, and
+there's also the C++ "hint" behavior that logs errors but allows invalid UTF8.
+
+**Note:** This could still be partially done in a follow-up LSC by targeting
+specific combinations of the new feature that disable validation in all relevant
+languages.
+
+#### Pros
+
+*   Punts on the issue, we wouldn't need any upb features and C++ features would
+    all be code-gen only
+*   Simplifies the situation, avoids adding a very complicated feature in
+    edition zero
+
+#### Cons
+
+*   Not really possible given the current complexity
+*   There are O(10M) proto2 string fields that would be blindly changed to bytes
+
+### Alternative 4: Nested Features
+
+Another option is to allow for shared feature set messages. For example, upb
+would define a feature message, but *not* make it an extension of the global
+`FeatureSet`. Instead, languages with upb implementations would have a field of
+this type to allow for finer-grained controls. C++ would both extend the global
+`FeatureSet` and also be allowed as a field in other languages.
+
+For example, python utf8 validation could be specified as:
+
+We could have checks during feature validation that enforce that impossible
+combinations aren't specified. For example, with our current implementation
+`features.(pb.python).cpp` should always be identical to `features.(pb.cpp)`,
+since we don't have any mechanism for distinguishing them.
+
+#### Pros
+
+*   Much more explicit than options 1 and 2
+
+#### Cons
+
+*   Maybe too explicit? Proto owners would be forced to duplicate a lot of
+    features
diff --git a/docs/options.md b/docs/options.md
@@ -486,3 +486,8 @@ with info about your project (name and website) so we can add an entry for you.
 
     *   Website: https://square.github.io/wire/
     *   Extensions: 1185
+
+1.  Protons
+
+    *   Website: https://github.com/ipfs/protons
+    *   Extensions: 1186
diff --git a/python/BUILD b/python/BUILD
@@ -203,5 +203,6 @@ py_extension(
         "//upb/util:compare",
         "//upb/util:def_to_proto",
         "//upb/util:required_fields",
+        "@utf8_range",
     ],
 )
diff --git a/python/convert.c b/python/convert.c
@@ -35,6 +35,7 @@
 #include "upb/message/map.h"
 #include "upb/reflection/message.h"
 #include "upb/util/compare.h"
+#include "utf8_range.h"
 
 // Must be last.
 #include "upb/port/def.inc"
@@ -259,20 +260,27 @@ bool PyUpb_PyToUpb(PyObject* obj, const upb_FieldDef* f, upb_MessageValue* val,
     }
     case kUpb_CType_String: {
       Py_ssize_t size;
-      const char* ptr;
-      PyObject* unicode = NULL;
       if (PyBytes_Check(obj)) {
-        unicode = obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL);
-        if (!obj) return false;
+        // Use the object's bytes if they are valid UTF-8.
+        char* ptr;
+        if (PyBytes_AsStringAndSize(obj, &ptr, &size) < 0) return false;
+        if (utf8_range2((const unsigned char*)ptr, size) != 0) {
+          // Invalid UTF-8.  Try to convert the message to a Python Unicode
+          // object, even though we know this will fail, just to get the
+          // idiomatic Python error message.
+          obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL);
+          assert(!obj);
+          return false;
+        }
+        *val = PyUpb_MaybeCopyString(ptr, size, arena);
+        return true;
+      } else {
+        const char* ptr;
+        ptr = PyUnicode_AsUTF8AndSize(obj, &size);
+        if (PyErr_Occurred()) return false;
+        *val = PyUpb_MaybeCopyString(ptr, size, arena);
+        return true;
       }
-      ptr = PyUnicode_AsUTF8AndSize(obj, &size);
-      if (PyErr_Occurred()) {
-        Py_XDECREF(unicode);
-        return false;
-      }
-      *val = PyUpb_MaybeCopyString(ptr, size, arena);
-      Py_XDECREF(unicode);
-      return true;
     }
     case kUpb_CType_Message:
       PyErr_Format(PyExc_ValueError, "Message objects may not be assigned");

diff --git a/python/google/protobuf/internal/message_test.py b/python/google/protobuf/internal/message_test.py
@@ -48,7 +48,6 @@
 
 warnings.simplefilter('error', DeprecationWarning)
 
-
 @_parameterized.named_parameters(('_proto2', unittest_pb2),
                                 ('_proto3', unittest_proto3_arena_pb2))
 @testing_refleaks.TestCase