Skip to content

Commit

Permalink
fix: get rid of the path for fully qualified names.
Browse files Browse the repository at this point in the history
This is not complete normalization feature but rather more a fix for the fully qualified paths vs simple name references. This fix not manage the various way a type can be expressed with a partial path.

Long explanation below:

If we accept the specifications from buf as correct, we can look at how a type reference is defined [here](https://protobuf.com/docs/language-spec#type-references).

The main problem with the previous implementation is that the current parser can't tell if a fully-qualified type reference in dot notation is the same as one identified by name alone aka simple name notation ([fully qualified reference](https://protobuf.com/docs/language-spec#fully-qualified-references)).

The fix do not consider all the different ways users can define a relative reference, schemas
 with different way of expressing a [relative reference](https://protobuf.com/docs/language-spec#relative-references) even if normalized,
 for now will keep being considered different.

Right now, our logic removes the `.package_name` (+ the Message scope) part
 before comparing field modifications (in fact it re-writes the schema using the simple name notation).

Even though the TypeTree (trie data structure) could help resolve [relative references](https://protobuf.com/docs/language-spec#relative-references), we don't want to add this feature in the python implementation now because it might cause new bugs due to the non-trivial behaviour of the protoc compiler.

We plan to properly normalize the protobuf schemas later.

We'll use protobuf descriptors (after the compilation and linking step) to gather type references already resolved, and we will threaten all the protobuf using always the fully qualified names.
Properly handling all path variations means reimplementing the protoc compiler behavior, we prefer relying on the already processed proto descriptor.

So, for now, we'll only implement a normalization for the fully-qualified references in dot notation and by simple name alone.

**NB:** This is not changing the semantics of the message since the local scope its always the one
 with max priority, so if you get rid of the fully-qualified reference protoc will resolve
 the reference with the one specified in the package scope.
  • Loading branch information
eliax1996 committed Jul 4, 2024
1 parent c6a0948 commit 0faa236
Show file tree
Hide file tree
Showing 7 changed files with 433 additions and 123 deletions.
100 changes: 66 additions & 34 deletions karapace/protobuf/proto_normalizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

from __future__ import annotations

from karapace.dependency import Dependency
from karapace.protobuf.enum_constant_element import EnumConstantElement
from karapace.protobuf.enum_element import EnumElement
from karapace.protobuf.extend_element import ExtendElement
from karapace.protobuf.field_element import FieldElement
from karapace.protobuf.group_element import GroupElement
from karapace.protobuf.known_dependency import KnownDependency
from karapace.protobuf.message_element import MessageElement
from karapace.protobuf.one_of_element import OneOfElement
from karapace.protobuf.option_element import OptionElement
Expand All @@ -19,8 +19,9 @@
from karapace.protobuf.schema import ProtobufSchema
from karapace.protobuf.service_element import ServiceElement
from karapace.protobuf.type_element import TypeElement
from karapace.schema_references import Reference
from typing import Mapping, Sequence
from karapace.protobuf.type_tree import TypeTree
from karapace.utils import remove_prefix
from typing import Sequence

import abc

Expand Down Expand Up @@ -77,28 +78,41 @@ class NormalizedGroupElement(GroupElement):
class NormalizedProtobufSchema(ProtobufSchema):
proto_file_element: NormalizedProtoFileElement

def __init__(
self,
schema: str,
references: Sequence[Reference] | None = None,
dependencies: Mapping[str, Dependency] | None = None,
proto_file_element: ProtoFileElement | None = None,
) -> None:
super().__init__(schema, references, dependencies, proto_file_element)
self.proto_file_element = normalize(self.proto_file_element)
@staticmethod
def from_protobuf_schema(protobuf_schema: ProtobufSchema) -> NormalizedProtobufSchema:
return normalize(protobuf_schema)


class NormalizedOneOfElement(OneOfElement):
fields: Sequence[NormalizedFieldElement]
groups: Sequence[NormalizedGroupElement]


def type_field_element_with_sorted_options(type_field: FieldElement) -> NormalizedFieldElement:
def type_field_element_with_sorted_options(
type_field: FieldElement, package: str, type_tree: TypeTree
) -> NormalizedFieldElement:
sorted_options = None if type_field.options is None else list(sorted(type_field.options, key=sort_by_name))
field_type_normalized = remove_prefix(remove_prefix(type_field.element_type, "."), f"{package}.")
reference_in_type_tree = type_tree.type_in_tree(field_type_normalized)
google_included_type = (
field_type_normalized in KnownDependency.index_simple or field_type_normalized in KnownDependency.index
)
element_type = (
field_type_normalized
# we can remove the package from the type only if there aren't clashing names in the same
# definition with the same relative prefix.
if (
reference_in_type_tree is not None
and reference_in_type_tree.is_uniquely_identifiable_type
and not google_included_type
)
or google_included_type
else type_field.element_type
)
return NormalizedFieldElement(
location=type_field.location,
label=type_field.label,
element_type=type_field.element_type,
element_type=element_type,
name=type_field.name,
default_value=type_field.default_value,
json_name=type_field.json_name,
Expand Down Expand Up @@ -135,9 +149,11 @@ def enum_element_with_sorted_options(enum_element: EnumElement) -> NormalizedEnu
)


def groups_with_sorted_options(group: GroupElement) -> NormalizedGroupElement:
def groups_with_sorted_options(group: GroupElement, package: str, type_tree: TypeTree) -> NormalizedGroupElement:
sorted_fields = (
None if group.fields is None else [type_field_element_with_sorted_options(field) for field in group.fields]
None
if group.fields is None
else [type_field_element_with_sorted_options(field, package, type_tree) for field in group.fields]
)
return NormalizedGroupElement(
label=group.label,
Expand All @@ -149,10 +165,10 @@ def groups_with_sorted_options(group: GroupElement) -> NormalizedGroupElement:
)


def one_ofs_with_sorted_options(one_ofs: OneOfElement) -> NormalizedOneOfElement:
def one_ofs_with_sorted_options(one_ofs: OneOfElement, package: str, type_tree: TypeTree) -> NormalizedOneOfElement:
sorted_options = None if one_ofs.options is None else list(sorted(one_ofs.options, key=sort_by_name))
sorted_fields = [type_field_element_with_sorted_options(field) for field in one_ofs.fields]
sorted_groups = [groups_with_sorted_options(group) for group in one_ofs.groups]
sorted_fields = [type_field_element_with_sorted_options(field, package, type_tree) for field in one_ofs.fields]
sorted_groups = [groups_with_sorted_options(group, package, type_tree) for group in one_ofs.groups]

return NormalizedOneOfElement(
name=one_ofs.name,
Expand All @@ -163,11 +179,15 @@ def one_ofs_with_sorted_options(one_ofs: OneOfElement) -> NormalizedOneOfElement
)


def message_element_with_sorted_options(message_element: MessageElement) -> NormalizedMessageElement:
def message_element_with_sorted_options(
message_element: MessageElement, package: str, type_tree: TypeTree
) -> NormalizedMessageElement:
sorted_options = None if message_element.options is None else list(sorted(message_element.options, key=sort_by_name))
sorted_nested_types = [type_element_with_sorted_options(nested_type) for nested_type in message_element.nested_types]
sorted_fields = [type_field_element_with_sorted_options(field) for field in message_element.fields]
sorted_one_ofs = [one_ofs_with_sorted_options(one_of) for one_of in message_element.one_ofs]
sorted_nested_types = [
type_element_with_sorted_options(nested_type, package, type_tree) for nested_type in message_element.nested_types
]
sorted_fields = [type_field_element_with_sorted_options(field, package, type_tree) for field in message_element.fields]
sorted_one_ofs = [one_ofs_with_sorted_options(one_of, package, type_tree) for one_of in message_element.one_ofs]

return NormalizedMessageElement(
location=message_element.location,
Expand All @@ -183,16 +203,16 @@ def message_element_with_sorted_options(message_element: MessageElement) -> Norm
)


def type_element_with_sorted_options(type_element: TypeElement) -> NormalizedTypeElement:
def type_element_with_sorted_options(type_element: TypeElement, package: str, type_tree: TypeTree) -> NormalizedTypeElement:
sorted_nested_types: list[TypeElement] = []

for nested_type in type_element.nested_types:
if isinstance(nested_type, EnumElement):
sorted_nested_types.append(enum_element_with_sorted_options(nested_type))
elif isinstance(nested_type, MessageElement):
sorted_nested_types.append(message_element_with_sorted_options(nested_type))
sorted_nested_types.append(message_element_with_sorted_options(nested_type, package, type_tree))
else:
raise ValueError("Unknown type element") # tried with assert_never but it did not work
raise ValueError(f"Unknown type element {type(nested_type)}") # tried with assert_never but it did not work

# doing it here since the subtypes do not declare the nested_types property
type_element.nested_types = sorted_nested_types
Expand All @@ -201,16 +221,18 @@ def type_element_with_sorted_options(type_element: TypeElement) -> NormalizedTyp
return enum_element_with_sorted_options(type_element)

if isinstance(type_element, MessageElement):
return message_element_with_sorted_options(type_element)
return message_element_with_sorted_options(type_element, package, type_tree)

raise ValueError("Unknown type element") # tried with assert_never but it did not work
raise ValueError(f"Unknown type element of type {type(type_element)}") # tried with assert_never but it did not work


def extends_element_with_sorted_options(extend_element: ExtendElement) -> NormalizedExtendElement:
def extends_element_with_sorted_options(
extend_element: ExtendElement, package: str, type_tree: TypeTree
) -> NormalizedExtendElement:
sorted_fields = (
None
if extend_element.fields is None
else [type_field_element_with_sorted_options(field) for field in extend_element.fields]
else [type_field_element_with_sorted_options(field, package, type_tree) for field in extend_element.fields]
)
return NormalizedExtendElement(
location=extend_element.location,
Expand Down Expand Up @@ -249,19 +271,22 @@ def service_element_with_sorted_options(service_element: ServiceElement) -> Norm
)


def normalize(proto_file_element: ProtoFileElement) -> NormalizedProtoFileElement:
def normalize(protobuf_schema: ProtobufSchema) -> NormalizedProtobufSchema:
proto_file_element = protobuf_schema.proto_file_element
type_tree = protobuf_schema.types_tree()
package = proto_file_element.package_name or ""
sorted_types: Sequence[NormalizedTypeElement] = [
type_element_with_sorted_options(type_element) for type_element in proto_file_element.types
type_element_with_sorted_options(type_element, package, type_tree) for type_element in proto_file_element.types
]
sorted_options = list(sorted(proto_file_element.options, key=sort_by_name))
sorted_services: Sequence[NormalizedServiceElement] = [
service_element_with_sorted_options(service) for service in proto_file_element.services
]
sorted_extend_declarations: Sequence[NormalizedExtendElement] = [
extends_element_with_sorted_options(extend) for extend in proto_file_element.extend_declarations
extends_element_with_sorted_options(extend, package, type_tree) for extend in proto_file_element.extend_declarations
]

return NormalizedProtoFileElement(
normalized_protobuf_element = NormalizedProtoFileElement(
location=proto_file_element.location,
package_name=proto_file_element.package_name,
syntax=proto_file_element.syntax,
Expand All @@ -272,3 +297,10 @@ def normalize(proto_file_element: ProtoFileElement) -> NormalizedProtoFileElemen
extend_declarations=sorted_extend_declarations,
options=sorted_options,
)

return NormalizedProtobufSchema(
protobuf_schema.schema,
protobuf_schema.references,
protobuf_schema.dependencies,
normalized_protobuf_element,
)
87 changes: 3 additions & 84 deletions karapace/protobuf/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@
from karapace.protobuf.proto_parser import ProtoParser
from karapace.protobuf.serialization import deserialize, serialize
from karapace.protobuf.type_element import TypeElement
from karapace.protobuf.type_tree import SourceFileReference, TypeTree
from karapace.protobuf.utils import append_documentation, append_indented
from karapace.schema_references import Reference
from typing import Iterable, Mapping, Sequence
from typing import Mapping, Sequence

import binascii
import itertools


def add_slashes(text: str) -> str:
Expand Down Expand Up @@ -122,88 +122,6 @@ class UsedType:
used_attribute_type: str


@default_dataclass
class SourceFileReference:
reference: str
import_order: int


@default_dataclass
class TypeTree:
token: str
children: list[TypeTree]
source_reference: SourceFileReference | None

def source_reference_tree_recursive(self) -> Iterable[SourceFileReference | None]:
sources = [] if self.source_reference is None else [self.source_reference]
for child in self.children:
sources = itertools.chain(sources, child.source_reference_tree())
return sources

def source_reference_tree(self) -> Iterable[SourceFileReference]:
return filter(lambda x: x is not None, self.source_reference_tree_recursive())

def inserted_elements(self) -> int:
"""
Returns the newest element generation accessible from that node.
Where with the term generation we mean the order for which a message
has been imported.
If called on the root of the tree it corresponds with the number of
fully specified path objects present in the tree.
"""
return max(reference.import_order for reference in self.source_reference_tree())

def oldest_matching_import(self) -> int:
"""
Returns the oldest element generation accessible from that node.
Where with the term generation we mean the order for which a message
has been imported.
"""
return min(reference.import_order for reference in self.source_reference_tree())

def expand_missing_absolute_path_recursive(self, oldest_import: int) -> Sequence[str]:
"""
Method that, once called on a node, traverse all the leaf and
return the oldest imported element with the common postfix.
This is also the current behaviour
of protobuf while dealing with a not fully specified path, it seeks for
the firstly imported message with a matching path.
"""
if self.source_reference is not None:
if self.source_reference.import_order == oldest_import:
return [self.token]
return []

for child in self.children:
maybe_oldest_child = child.expand_missing_absolute_path_recursive(oldest_import)
if maybe_oldest_child is not None:
return list(maybe_oldest_child) + [self.token]

return []

def expand_missing_absolute_path(self) -> Sequence[str]:
oldest_import = self.oldest_matching_import()
expanded_missing_path = self.expand_missing_absolute_path_recursive(oldest_import)
assert (
expanded_missing_path is not None
), "each node should have, by construction, at least a leaf that is a fully specified path"
return expanded_missing_path[:-1] # skipping myself since I was matched

@property
def is_fully_qualified_type(self) -> bool:
return len(self.children) == 0

def represent(self, level=0) -> str:
spacing = " " * 3 * level
if self.is_fully_qualified_type:
return f"{spacing}>{self.token}"
child_repr = "\n".join(child.represent(level=level + 1) for child in self.children)
return f"{spacing}{self.token} ->\n{child_repr}"

def __repr__(self) -> str:
return self.represent()


def _add_new_type_recursive(
parent_tree: TypeTree,
remaining_tokens: list[str],
Expand Down Expand Up @@ -258,6 +176,7 @@ def __init__(
) -> None:
if type(schema).__name__ != "str":
raise IllegalArgumentException("Non str type of schema string")
self.schema = schema
self.cache_string = ""

if proto_file_element is not None:
Expand Down
Loading

0 comments on commit 0faa236

Please sign in to comment.