Skip to content

Commit

Permalink
hotfix: get rid of the path for fully qualified names.
Browse files Browse the repository at this point in the history
This is not complete normalization feature but rather more a fix for the fully qualified paths vs simple name references. This fix not manage the various way a type can be expressed with a partial path.

Long explanation below:

If we accept the specifications from buf as correct, we can look at how a type reference is defined [here](https://protobuf.com/docs/language-spec#type-references).

The main problem with the previous implementation is that the current parser can't tell if a fully-qualified type reference in dot notation is the same as one identified by name alone aka simple name notation ([fully qualified reference](https://protobuf.com/docs/language-spec#fully-qualified-references)).

The fix do not consider all the different ways users can define a relative reference, schemas
 with different way of expressing a [relative reference](https://protobuf.com/docs/language-spec#relative-references) even if normalized,
 for now will keep being considered different.

Right now, our logic removes the `.package_name` (+ the Message scope) part
 before comparing field modifications (in fact it re-writes the schema using the simple name notation).

Even though the TypeTree (trie data structure) could help resolve [relative references](https://protobuf.com/docs/language-spec#relative-references), we don't want to add this feature in the python implementation now because it might cause new bugs due to the non-trivial behaviour of the protoc compiler.

We plan to properly normalize the protobuf schemas later.

We'll use protobuf descriptors (after the compilation and linking step) to gather type references already resolved, and we will threaten all the protobuf using always the fully qualified names.
Properly handling all path variations means reimplementing the protoc compiler behavior, we prefer relying on the already processed proto descriptor.

So, for now, we'll only implement a normalization for the fully-qualified references in dot notation and by simple name alone.

**NB:** This is not changing the semantics of the message since the local scope its always the one
 with max priority, so if you get rid of the fully-qualified reference protoc will resolve
 the reference with the one specified in the package scope.
  • Loading branch information
eliax1996 committed Jul 3, 2024
1 parent c6a0948 commit 5c87e1c
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 15 deletions.
37 changes: 25 additions & 12 deletions karapace/protobuf/proto_normalizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,17 @@ class NormalizedOneOfElement(OneOfElement):
groups: Sequence[NormalizedGroupElement]


def type_field_element_with_sorted_options(type_field: FieldElement) -> NormalizedFieldElement:
def type_field_element_with_sorted_options(type_field: FieldElement, scope: str) -> NormalizedFieldElement:
sorted_options = None if type_field.options is None else list(sorted(type_field.options, key=sort_by_name))
field_type_normalized = type_field.element_type.removeprefix(
"."
).removeprefix( # getting rid of the optional `.` before the package name
f"{scope}."
)
return NormalizedFieldElement(
location=type_field.location,
label=type_field.label,
element_type=type_field.element_type,
element_type=field_type_normalized,
name=type_field.name,
default_value=type_field.default_value,
json_name=type_field.json_name,
Expand All @@ -119,13 +124,14 @@ def enum_constant_element_with_sorted_options(enum_constant: EnumConstantElement
)


def enum_element_with_sorted_options(enum_element: EnumElement) -> NormalizedEnumElement:
def enum_element_with_sorted_options(enum_element: EnumElement, scope: str) -> NormalizedEnumElement:
sorted_options = None if enum_element.options is None else list(sorted(enum_element.options, key=sort_by_name))
constants_with_sorted_options = (
None
if enum_element.constants is None
else [enum_constant_element_with_sorted_options(constant) for constant in enum_element.constants]
)
# todo: strip scope.
return NormalizedEnumElement(
location=enum_element.location,
name=enum_element.name,
Expand Down Expand Up @@ -163,10 +169,13 @@ def one_ofs_with_sorted_options(one_ofs: OneOfElement) -> NormalizedOneOfElement
)


def message_element_with_sorted_options(message_element: MessageElement) -> NormalizedMessageElement:
def message_element_with_sorted_options(message_element: MessageElement, scope: str) -> NormalizedMessageElement:
message_scope = f"{scope}.{message_element.name}"
sorted_options = None if message_element.options is None else list(sorted(message_element.options, key=sort_by_name))
sorted_nested_types = [type_element_with_sorted_options(nested_type) for nested_type in message_element.nested_types]
sorted_fields = [type_field_element_with_sorted_options(field) for field in message_element.fields]
sorted_nested_types = [
type_element_with_sorted_options(nested_type, scope) for nested_type in message_element.nested_types
]
sorted_fields = [type_field_element_with_sorted_options(field, scope) for field in message_element.fields]
sorted_one_ofs = [one_ofs_with_sorted_options(one_of) for one_of in message_element.one_ofs]

return NormalizedMessageElement(
Expand All @@ -183,25 +192,28 @@ def message_element_with_sorted_options(message_element: MessageElement) -> Norm
)


def type_element_with_sorted_options(type_element: TypeElement) -> NormalizedTypeElement:
def type_element_with_sorted_options(type_element: TypeElement, scope: str) -> NormalizedTypeElement:
sorted_nested_types: list[TypeElement] = []

parent_name = type_element.name
message_scope = f"{scope}.{parent_name}"

for nested_type in type_element.nested_types:
if isinstance(nested_type, EnumElement):
sorted_nested_types.append(enum_element_with_sorted_options(nested_type))
sorted_nested_types.append(enum_element_with_sorted_options(nested_type, message_scope))
elif isinstance(nested_type, MessageElement):
sorted_nested_types.append(message_element_with_sorted_options(nested_type))
sorted_nested_types.append(message_element_with_sorted_options(nested_type, message_scope))
else:
raise ValueError("Unknown type element") # tried with assert_never but it did not work

# doing it here since the subtypes do not declare the nested_types property
type_element.nested_types = sorted_nested_types

if isinstance(type_element, EnumElement):
return enum_element_with_sorted_options(type_element)
return enum_element_with_sorted_options(type_element, scope)

if isinstance(type_element, MessageElement):
return message_element_with_sorted_options(type_element)
return message_element_with_sorted_options(type_element, scope)

raise ValueError("Unknown type element") # tried with assert_never but it did not work

Expand Down Expand Up @@ -251,7 +263,8 @@ def service_element_with_sorted_options(service_element: ServiceElement) -> Norm

def normalize(proto_file_element: ProtoFileElement) -> NormalizedProtoFileElement:
sorted_types: Sequence[NormalizedTypeElement] = [
type_element_with_sorted_options(type_element) for type_element in proto_file_element.types
type_element_with_sorted_options(type_element, proto_file_element.package_name)
for type_element in proto_file_element.types
]
sorted_options = list(sorted(proto_file_element.options, key=sort_by_name))
sorted_services: Sequence[NormalizedServiceElement] = [
Expand Down
107 changes: 104 additions & 3 deletions tests/unit/protobuf/test_protobuf_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from karapace.protobuf.location import Location
from karapace.protobuf.proto_normalizations import normalize
from karapace.protobuf.proto_parser import ProtoParser
from typing import Final

import pytest

location: Location = Location("some/folder", "file.proto")
LOCATION: Final[Location] = Location("somefolder", "file.proto")


# this would be a good case for using a property based test with a well-formed message generator
Expand Down Expand Up @@ -511,10 +512,110 @@
),
)
def test_differently_ordered_options_normalizes_equally(ordered_schema: str, unordered_schema: str) -> None:
ordered_proto = ProtoParser.parse(location, ordered_schema)
unordered_proto = ProtoParser.parse(location, unordered_schema)
ordered_proto = ProtoParser.parse(LOCATION, ordered_schema)
unordered_proto = ProtoParser.parse(LOCATION, unordered_schema)

result = CompareResult()
normalize(ordered_proto).compare(normalize(unordered_proto), result)
assert result.is_compatible()
assert normalize(ordered_proto).to_schema() == normalize(unordered_proto).to_schema()


PROTO_WITH_FULLY_QUALIFIED_PATHS = """\
syntax = "proto3";
package my.awesome.customer.v1;
import "my/awesome/customer/v1/nested_value.proto";
import "google/protobuf/timestamp.proto";
option csharp_namespace = "my.awesome.customer.V1";
option go_package = "github.com/customer/api/my/awesome/customer/v1;dspv1";
option java_multiple_files = true;
option java_outer_classname = "EventValueProto";
option java_package = "com.my.awesome.customer.v1";
option objc_class_prefix = "TDD";
option php_metadata_namespace = "My\\Awesome\\Customer\\V1";
option php_namespace = "My\\Awesome\\Customer\\V1";
option ruby_package = "My::Awesome::Customer::V1";
message EventValue {
.my.awesome.customer.v1.NestedValue nested_value = 1;
.google.protobuf.Timestamp created_at = 2;
}
"""


PROTO_WITH_SIMPLE_NAMES = """\
syntax = "proto3";
package my.awesome.customer.v1;
import "my/awesome/customer/v1/nested_value.proto";
import "google/protobuf/timestamp.proto";
option csharp_namespace = "my.awesome.customer.V1";
option go_package = "github.com/customer/api/my/awesome/customer/v1;dspv1";
option java_multiple_files = true;
option java_outer_classname = "EventValueProto";
option java_package = "com.my.awesome.customer.v1";
option objc_class_prefix = "TDD";
option php_metadata_namespace = "My\\Awesome\\Customer\\V1";
option php_namespace = "My\\Awesome\\Customer\\V1";
option ruby_package = "My::Awesome::Customer::V1";
message EventValue {
NestedValue nested_value = 1;
google.protobuf.Timestamp created_at = 2;
}"""


def test_full_path_and_simple_names_are_equal() -> None:
"""
This test aims to ensure that after the normalization process the schema expressed as SimpleNames will match the same
schemas expressed with fully qualified references.
This does not consider the normalization process for the different ways a type can be expressed as a relative reference.
Long explaination below:
If we accept the specifications from buf as correct, we can look at how a type
reference is defined here: https://protobuf.com/docs/language-spec#type-references.
The main problem with the previous implementation is that the current parser can't tell
if a fully-qualified type reference in dot notation is the same as one identified by name
alone aka simple name notation (https://protobuf.com/docs/language-spec#fully-qualified-references).
The fix do not consider all the different ways users can define a relative reference, schemas
with different way of expressing a relative reference even if normalized,
for now will keep being considered different (https://protobuf.com/docs/language-spec#relative-references).
Right now, our logic removes the `.package_name` (+ the Message scope) part
before comparing field modifications (in fact it re-writes the schema using the simple name notation).
Even though the TypeTree (trie data structure) could help resolve
relative references (https://protobuf.com/docs/language-spec#relative-references),
we don't want to add this feature in the python implementation now because it might cause new bugs due to the
non-trivial behaviour of the protoc compiler.
We plan to properly normalize the protobuf schemas later.
We'll use protobuf descriptors (after the compilation and linking step)
to gather type references already resolved, and we will threaten all the protobuf
using always the fully qualified names.
Properly handling all path variations means reimplementing the protoc compiler behavior,
we prefer relying on the already processed proto descriptor.
So, for now, we'll only implement a normalization for the fully-qualified references
in dot notation and by simple name alone.
This is not changing the semantics of the message since the local scope its always the one
with max priority, so if you get rid of the fully-qualified reference protoc will resolve
the reference with the one specified in the package scope.
"""

fully_qualitifed_simple_name_notation = ProtoParser.parse(LOCATION, PROTO_WITH_SIMPLE_NAMES)
fully_qualitifed_dot_notation = ProtoParser.parse(LOCATION, PROTO_WITH_FULLY_QUALIFIED_PATHS)
result = CompareResult()
normalize(fully_qualitifed_dot_notation).compare(normalize(fully_qualitifed_simple_name_notation), result)
assert result.is_compatible()
assert (
normalize(fully_qualitifed_dot_notation).to_schema() == normalize(fully_qualitifed_simple_name_notation).to_schema()
)

0 comments on commit 5c87e1c

Please sign in to comment.