Fix 1633 unexpected key error with primary key in schema and 1635 une…

…xpected missing label error when ignoring header case (#1641) - fixes #1633 - fixes #1635 --- This PR fixes unexpected `missing-label` error in validation report which occurs when validating tabular data in this specific case (#1635): The data contains a named column corresponding to a required field in the schema used for validation, written without respecting case-sensitivity. For example: ``` data = [["aa", "BB"], ["a", "b"]] schema = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ {"name": "AA", "constraints": {"required": True}}, {"name": "bb", "constraints": {"required": True}} ] } ``` It adds some test cases: - one test case to ensure that validating this tabular data with this schema and using `schema_sync=True` and `dialect=Dialect(header_case=False)` options, returns a valid report. - one test case to ensure that if validation is case-sensitive, if a name column does not respect case a missing-label occurs in validation report. - another test case to ensure that validating with missing required column "AA" with this schema and using `schema_sync=True` and `dialect=Dialect(header_case=False)` options, returns an invalid report with `missing-label` error related to field "AA". This PR also fixes unexpected KeyError raised when a primary key used in shema is missing in the tabular data to validate(#1633). For example: ``` data = [["B"], ["b"]] schema = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ {"name": "A"}, {"name": "B"} ], "primaryKey": ["A"] } ``` It adds some test cases: - two test cases to ensure that when a label related to the schema primary key is missing, the validation report is invalid with single missing-label error: - a test case with the schema containing the primary key field not specified as 'required' - a test case with the schema containing the primary key field specified as 'required' - a test case to deal with insensitivity case in the data label corresponding to the primary key schema field: the validation report is valid Finally, I suggest some refactoring in this PR: - refactoring removing missing required label from field info refactoring removing missing required label from field info - refactoring creating schema fields among labels part when using `schema_sync` option in `detect_schema()` `Detector` method - refactoring to introduce intermediary `TableResource` methods to create `cells` related to primary key schema. --------- Co-authored-by: Pierre Camilleri <[email protected]>
frictionlessdata · Aug 30, 2024 · 7861f0e · 7861f0e
1 parent 4d334f0
commit 7861f0e
Show file tree

Hide file tree

Showing 5 changed files with 276 additions and 40 deletions.
diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py
@@ -298,6 +298,7 @@ def detect_schema(
         labels: Optional[List[str]] = None,
         schema: Optional[Schema] = None,
         field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES,
+        **options: Any,
     ) -> Schema:
         """Detect schema from fragment
 
@@ -405,20 +406,29 @@ def detect_schema(
         # Sync schema
         if self.schema_sync:
             if labels:
+                case_sensitive = options["header_case"]
+
+                if not case_sensitive:
+                    labels = [label.lower() for label in labels]
+
                 if len(labels) != len(set(labels)):
                     note = '"schema_sync" requires unique labels in the header'
                     raise FrictionlessException(note)
-                mapping = {field.name: field for field in schema.fields}  # type: ignore
-                schema.clear_fields()
-                for name in labels:
-                    field = mapping.get(name)
-                    if not field:
-                        field = Field.from_descriptor({"name": name, "type": "any"})
-                    schema.add_field(field)
-                # For required fields that are missing
-                for _, field in mapping.items():
-                    if field and field.required and field.name not in labels:
-                        schema.add_field(field)
+
+                mapped_fields = self.mapped_schema_fields_names(
+                    schema.fields,  # type: ignore
+                    case_sensitive,
+                )
+
+                self.rearrange_schema_fields_given_labels(
+                    mapped_fields,
+                    schema,
+                    labels,
+                )
+
+                self.add_missing_required_labels_to_schema_fields(
+                    mapped_fields, schema, labels, case_sensitive
+                )
 
         # Patch schema
         if self.schema_patch:
@@ -432,4 +442,58 @@ def detect_schema(
                 field_descriptor.update(field_patch)
             schema = Schema.from_descriptor(descriptor)
 
-        return schema  # type: ignore
+        return schema
+
+    @staticmethod
+    def mapped_schema_fields_names(
+        fields: List[Field], case_sensitive: bool
+    ) -> Dict[str, Field]:
+        """Create a dictionnary to map field names with schema fields"""
+        if case_sensitive:
+            return {field.name: field for field in fields}
+        else:
+            return {field.name.lower(): field for field in fields}
+
+    @staticmethod
+    def rearrange_schema_fields_given_labels(
+        fields_mapping: Dict[str, Field],
+        schema: Schema,
+        labels: List[str],
+    ):
+        """Rearrange fields according to the order of labels. All fields
+        missing from labels are dropped"""
+        schema.clear_fields()
+
+        for name in labels:
+            default_field = Field.from_descriptor({"name": name, "type": "any"})
+            field = fields_mapping.get(name, default_field)
+            schema.add_field(field)
+
+    def add_missing_required_labels_to_schema_fields(
+        self,
+        fields_mapping: Dict[str, Field],
+        schema: Schema,
+        labels: List[str],
+        case_sensitive: bool,
+    ):
+        """This method aims to add missing required labels and
+        primary key field not in labels to schema fields.
+        """
+        for name, field in fields_mapping.items():
+            if (
+                self.field_is_required(field, schema, case_sensitive)
+                and name not in labels
+            ):
+                schema.add_field(field)
+
+    @staticmethod
+    def field_is_required(
+        field: Field,
+        schema: Schema,
+        case_sensitive: bool,
+    ) -> bool:
+        if case_sensitive:
+            return field.required or field.name in schema.primary_key
+        else:
+            lower_primary_key = [pk.lower() for pk in schema.primary_key]
+            return field.required or field.name.lower() in lower_primary_key
diff --git a/frictionless/resource/__spec__/test_security.py b/frictionless/resource/__spec__/test_security.py
@@ -27,9 +27,11 @@ def test_resource_source_path_error_bad_path_not_safe_traversing():
         Resource(
             {
                 "name": "name",
-                "path": "data/../data/table.csv"
-                if not platform.type == "windows"
-                else "data\\..\\table.csv",
+                "path": (
+                    "data/../data/table.csv"
+                    if not platform.type == "windows"
+                    else "data\\..\\table.csv"
+                ),
             }
         )
     error = excinfo.value.error

diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
@@ -5,6 +5,8 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
+from frictionless.schema.field import Field
+
 from .. import errors, helpers, settings
 from ..analyzer import Analyzer
 from ..dialect import Dialect
@@ -204,6 +206,7 @@ def __open_schema(self):
             labels=self.labels,
             schema=self.schema,
             field_candidates=system.detect_field_candidates(),
+            header_case=self.dialect.header_case,
         )
         self.stats.fields = len(self.schema.fields)
 
@@ -331,15 +334,21 @@ def row_stream():
 
                 # Primary Key Error
                 if is_integrity and self.schema.primary_key:
-                    cells = tuple(row[name] for name in self.schema.primary_key)
-                    if set(cells) == {None}:
-                        note = 'cells composing the primary keys are all "None"'
-                        error = errors.PrimaryKeyError.from_row(row, note=note)
-                        row.errors.append(error)
+                    try:
+                        cells = self.primary_key_cells(row, self.dialect.header_case)
+                    except KeyError:
+                        # Row does not have primary_key as label
+                        # There should already be a missing-label error in
+                        # in self.header corresponding to the schema primary key
+                        assert not self.header.valid
                     else:
-                        match = memory_primary.get(cells)
-                        memory_primary[cells] = row.row_number
-                        if match:
+                        if set(cells) == {None}:
+                            note = 'cells composing the primary keys are all "None"'
+                            error = errors.PrimaryKeyError.from_row(row, note=note)
+                            row.errors.append(error)
+                        else:
+                            match = memory_primary.get(cells)
+                            memory_primary[cells] = row.row_number
                             if match:
                                 note = "the same as in the row at position %s" % match
                                 error = errors.PrimaryKeyError.from_row(row, note=note)
@@ -386,20 +395,71 @@ def row_stream():
                 # Yield row
                 yield row
 
-        # NB: missing required labels are not included in the
-        # field_info parameter used for row creation
         if self.detector.schema_sync:
+            # Missing required labels are not included in the
+            # field_info parameter used for row creation
             for field in self.schema.fields:
-                if field.name not in self.labels and field.name in field_info["names"]:
-                    field_index = field_info["names"].index(field.name)
-                    del field_info["names"][field_index]
-                    del field_info["objects"][field_index]
-                    del field_info["mapping"][field.name]
-        # # Create row stream
+                self.remove_missing_required_label_from_field_info(field, field_info)
+
+        # Create row stream
         self.__row_stream = row_stream()
 
-    # Read
+    def remove_missing_required_label_from_field_info(
+        self, field: Field, field_info: Dict[str, Any]
+    ):
+        is_case_sensitive = self.dialect.header_case
+        if self.label_is_missing(
+            field.name, field_info["names"], self.labels, is_case_sensitive
+        ):
+            self.remove_field_from_field_info(field.name, field_info)
+
+    @staticmethod
+    def label_is_missing(
+        field_name: str,
+        expected_field_names: List[str],
+        table_labels: types.ILabels,
+        case_sensitive: bool,
+    ) -> bool:
+        """Check if a schema field name is missing from the TableResource
+        labels.
+        """
+        if not case_sensitive:
+            field_name = field_name.lower()
+            table_labels = [label.lower() for label in table_labels]
+            expected_field_names = [
+                field_name.lower() for field_name in expected_field_names
+            ]
+
+        return field_name not in table_labels and field_name in expected_field_names
 
+    @staticmethod
+    def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]):
+        field_index = field_info["names"].index(field_name)
+        del field_info["names"][field_index]
+        del field_info["objects"][field_index]
+        del field_info["mapping"][field_name]
+
+    def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]:
+        """Create a tuple containg all cells from a given row associated to primary
+        keys"""
+        return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive))
+
+    def primary_key_labels(
+        self,
+        row: Row,
+        case_sensitive: bool,
+    ) -> List[str]:
+        """Create a list of TableResource labels that are primary keys"""
+        if case_sensitive:
+            labels_primary_key = self.schema.primary_key
+        else:
+            lower_primary_key = [pk.lower() for pk in self.schema.primary_key]
+            labels_primary_key = [
+                label for label in row.field_names if label.lower() in lower_primary_key
+            ]
+        return labels_primary_key
+
+    # Read
     def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]:
         """Read lists into memory
 

diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py
@@ -1,3 +1,6 @@
+import pytest
+
+import frictionless
 from frictionless import Schema, fields
 from frictionless.resources import TableResource
 
@@ -37,3 +40,44 @@ def test_missing_label():
         assert header == ["id", "name", "extra"]
         assert header.labels == ["id", "name"]
         assert header.valid is False
+
+
+@pytest.mark.parametrize(
+    "source, required, valid_report, nb_errors, types_errors_expected, header_case",
+    [
+        ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
+        ([["B"], ["foo"]], {}, False, 1, ["missing-label"], True),
+        ([["a"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
+        ([["a"], ["foo"]], {}, False, 1, ["missing-label"], True),
+        # Ignore header_case
+        ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], False),
+        ([["B"], ["foo"]], {}, False, 1, ["missing-label"], False),
+        ([["a"], ["foo"]], {"required": True}, True, 0, [], False),
+        ([["a"], ["foo"]], {}, True, 0, [], False),
+    ],
+)
+def test_missing_primary_key_label_with_shema_sync_issue_1633(
+    source, required, valid_report, nb_errors, types_errors_expected, header_case
+):
+    schema_descriptor = {
+        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
+        "fields": [{"name": "A", "constraints": required}],
+        "primaryKey": ["A"],
+    }
+
+    resource = TableResource(
+        source=source,
+        schema=Schema.from_descriptor(schema_descriptor),
+        detector=frictionless.Detector(schema_sync=True),
+        dialect=frictionless.Dialect(header_case=header_case),
+    )
+
+    report = frictionless.validate(resource)
+
+    assert report.valid == valid_report
+
+    if not report.valid:
+        errors = report.tasks[0].errors
+        assert len(errors) == nb_errors
+        for error, type_expected in zip(errors, types_errors_expected):
+            assert error.type == type_expected