From 7861f0e6e3dd9e8392d1d8d57f850c7b5862b81a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Am=C3=A9lie=20Rondot?=
 <119412389+amelie-rondot@users.noreply.github.com>
Date: Fri, 30 Aug 2024 13:22:48 +0200
Subject: [PATCH] Fix 1633 unexpected key error with primary key in schema and
 1635 unexpected missing label error when ignoring header case (#1641)

- fixes #1633
- fixes #1635

---

This PR fixes unexpected `missing-label` error in validation report
which occurs when validating tabular data in this specific case (#1635):
The data contains a named column corresponding to a required field in
the schema used for validation, written without respecting
case-sensitivity.

For example:

```
data = [["aa", "BB"], ["a", "b"]]
schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "fields": [
            {"name": "AA", "constraints": {"required": True}},
            {"name": "bb", "constraints": {"required": True}}
        ]
    }
```

It adds some test cases:

- one test case to ensure that validating this tabular data with this
schema and using `schema_sync=True` and
`dialect=Dialect(header_case=False)` options, returns a valid report.
- one test case to ensure that if validation is case-sensitive, if a
name column does not respect case a missing-label occurs in validation
report.
- another test case to ensure that validating with missing required
column "AA" with this schema and using `schema_sync=True` and
`dialect=Dialect(header_case=False)` options, returns an invalid report
with `missing-label` error related to field "AA".

This PR also fixes unexpected KeyError raised when a primary key used in
shema is missing in the tabular data to validate(#1633).
For example:
```
data = [["B"], ["b"]]
schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "fields": [
            {"name": "A"},
            {"name": "B"}
        ],
        "primaryKey": ["A"]
    }
```
It adds some test cases:
- two test cases to ensure that when a label related to the schema
primary key is missing, the validation report is invalid with single
missing-label error:
- a test case with the schema containing the primary key field not
specified as 'required'
- a test case with the schema containing the primary key field specified
as 'required'
- a test case to deal with insensitivity case in the data label
corresponding to the primary key schema field: the validation report is
valid

Finally, I suggest some refactoring in this PR:
- refactoring removing missing required label from field info
refactoring removing missing required label from field info
- refactoring creating schema fields among labels part when using
`schema_sync` option in `detect_schema()` `Detector` method
- refactoring to introduce intermediary `TableResource` methods to
create `cells` related to primary key schema.

---------

Co-authored-by: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com>
---
 frictionless/detector/detector.py             | 88 ++++++++++++++---
 .../resource/__spec__/test_security.py        |  8 +-
 frictionless/resources/table.py               | 94 +++++++++++++++----
 frictionless/table/__spec__/test_header.py    | 44 +++++++++
 .../__spec__/resource/test_schema.py          | 82 ++++++++++++++--
 5 files changed, 276 insertions(+), 40 deletions(-)

diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py
index dd656427d5..fc3e47e0a1 100644
--- a/frictionless/detector/detector.py
+++ b/frictionless/detector/detector.py
@@ -298,6 +298,7 @@ def detect_schema(
         labels: Optional[List[str]] = None,
         schema: Optional[Schema] = None,
         field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES,
+        **options: Any,
     ) -> Schema:
         """Detect schema from fragment
 
@@ -405,20 +406,29 @@ def detect_schema(
         # Sync schema
         if self.schema_sync:
             if labels:
+                case_sensitive = options["header_case"]
+
+                if not case_sensitive:
+                    labels = [label.lower() for label in labels]
+
                 if len(labels) != len(set(labels)):
                     note = '"schema_sync" requires unique labels in the header'
                     raise FrictionlessException(note)
-                mapping = {field.name: field for field in schema.fields}  # type: ignore
-                schema.clear_fields()
-                for name in labels:
-                    field = mapping.get(name)
-                    if not field:
-                        field = Field.from_descriptor({"name": name, "type": "any"})
-                    schema.add_field(field)
-                # For required fields that are missing
-                for _, field in mapping.items():
-                    if field and field.required and field.name not in labels:
-                        schema.add_field(field)
+
+                mapped_fields = self.mapped_schema_fields_names(
+                    schema.fields,  # type: ignore
+                    case_sensitive,
+                )
+
+                self.rearrange_schema_fields_given_labels(
+                    mapped_fields,
+                    schema,
+                    labels,
+                )
+
+                self.add_missing_required_labels_to_schema_fields(
+                    mapped_fields, schema, labels, case_sensitive
+                )
 
         # Patch schema
         if self.schema_patch:
@@ -432,4 +442,58 @@ def detect_schema(
                 field_descriptor.update(field_patch)
             schema = Schema.from_descriptor(descriptor)
 
-        return schema  # type: ignore
+        return schema
+
+    @staticmethod
+    def mapped_schema_fields_names(
+        fields: List[Field], case_sensitive: bool
+    ) -> Dict[str, Field]:
+        """Create a dictionnary to map field names with schema fields"""
+        if case_sensitive:
+            return {field.name: field for field in fields}
+        else:
+            return {field.name.lower(): field for field in fields}
+
+    @staticmethod
+    def rearrange_schema_fields_given_labels(
+        fields_mapping: Dict[str, Field],
+        schema: Schema,
+        labels: List[str],
+    ):
+        """Rearrange fields according to the order of labels. All fields
+        missing from labels are dropped"""
+        schema.clear_fields()
+
+        for name in labels:
+            default_field = Field.from_descriptor({"name": name, "type": "any"})
+            field = fields_mapping.get(name, default_field)
+            schema.add_field(field)
+
+    def add_missing_required_labels_to_schema_fields(
+        self,
+        fields_mapping: Dict[str, Field],
+        schema: Schema,
+        labels: List[str],
+        case_sensitive: bool,
+    ):
+        """This method aims to add missing required labels and
+        primary key field not in labels to schema fields.
+        """
+        for name, field in fields_mapping.items():
+            if (
+                self.field_is_required(field, schema, case_sensitive)
+                and name not in labels
+            ):
+                schema.add_field(field)
+
+    @staticmethod
+    def field_is_required(
+        field: Field,
+        schema: Schema,
+        case_sensitive: bool,
+    ) -> bool:
+        if case_sensitive:
+            return field.required or field.name in schema.primary_key
+        else:
+            lower_primary_key = [pk.lower() for pk in schema.primary_key]
+            return field.required or field.name.lower() in lower_primary_key
diff --git a/frictionless/resource/__spec__/test_security.py b/frictionless/resource/__spec__/test_security.py
index e44916e727..36c2fc2b3b 100644
--- a/frictionless/resource/__spec__/test_security.py
+++ b/frictionless/resource/__spec__/test_security.py
@@ -27,9 +27,11 @@ def test_resource_source_path_error_bad_path_not_safe_traversing():
         Resource(
             {
                 "name": "name",
-                "path": "data/../data/table.csv"
-                if not platform.type == "windows"
-                else "data\\..\\table.csv",
+                "path": (
+                    "data/../data/table.csv"
+                    if not platform.type == "windows"
+                    else "data\\..\\table.csv"
+                ),
             }
         )
     error = excinfo.value.error
diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index 98a76f3473..fdf9ffce4b 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -5,6 +5,8 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
+from frictionless.schema.field import Field
+
 from .. import errors, helpers, settings
 from ..analyzer import Analyzer
 from ..dialect import Dialect
@@ -204,6 +206,7 @@ def __open_schema(self):
             labels=self.labels,
             schema=self.schema,
             field_candidates=system.detect_field_candidates(),
+            header_case=self.dialect.header_case,
         )
         self.stats.fields = len(self.schema.fields)
 
@@ -331,15 +334,21 @@ def row_stream():
 
                 # Primary Key Error
                 if is_integrity and self.schema.primary_key:
-                    cells = tuple(row[name] for name in self.schema.primary_key)
-                    if set(cells) == {None}:
-                        note = 'cells composing the primary keys are all "None"'
-                        error = errors.PrimaryKeyError.from_row(row, note=note)
-                        row.errors.append(error)
+                    try:
+                        cells = self.primary_key_cells(row, self.dialect.header_case)
+                    except KeyError:
+                        # Row does not have primary_key as label
+                        # There should already be a missing-label error in
+                        # in self.header corresponding to the schema primary key
+                        assert not self.header.valid
                     else:
-                        match = memory_primary.get(cells)
-                        memory_primary[cells] = row.row_number
-                        if match:
+                        if set(cells) == {None}:
+                            note = 'cells composing the primary keys are all "None"'
+                            error = errors.PrimaryKeyError.from_row(row, note=note)
+                            row.errors.append(error)
+                        else:
+                            match = memory_primary.get(cells)
+                            memory_primary[cells] = row.row_number
                             if match:
                                 note = "the same as in the row at position %s" % match
                                 error = errors.PrimaryKeyError.from_row(row, note=note)
@@ -386,20 +395,71 @@ def row_stream():
                 # Yield row
                 yield row
 
-        # NB: missing required labels are not included in the
-        # field_info parameter used for row creation
         if self.detector.schema_sync:
+            # Missing required labels are not included in the
+            # field_info parameter used for row creation
             for field in self.schema.fields:
-                if field.name not in self.labels and field.name in field_info["names"]:
-                    field_index = field_info["names"].index(field.name)
-                    del field_info["names"][field_index]
-                    del field_info["objects"][field_index]
-                    del field_info["mapping"][field.name]
-        # # Create row stream
+                self.remove_missing_required_label_from_field_info(field, field_info)
+
+        # Create row stream
         self.__row_stream = row_stream()
 
-    # Read
+    def remove_missing_required_label_from_field_info(
+        self, field: Field, field_info: Dict[str, Any]
+    ):
+        is_case_sensitive = self.dialect.header_case
+        if self.label_is_missing(
+            field.name, field_info["names"], self.labels, is_case_sensitive
+        ):
+            self.remove_field_from_field_info(field.name, field_info)
+
+    @staticmethod
+    def label_is_missing(
+        field_name: str,
+        expected_field_names: List[str],
+        table_labels: types.ILabels,
+        case_sensitive: bool,
+    ) -> bool:
+        """Check if a schema field name is missing from the TableResource
+        labels.
+        """
+        if not case_sensitive:
+            field_name = field_name.lower()
+            table_labels = [label.lower() for label in table_labels]
+            expected_field_names = [
+                field_name.lower() for field_name in expected_field_names
+            ]
+
+        return field_name not in table_labels and field_name in expected_field_names
 
+    @staticmethod
+    def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]):
+        field_index = field_info["names"].index(field_name)
+        del field_info["names"][field_index]
+        del field_info["objects"][field_index]
+        del field_info["mapping"][field_name]
+
+    def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]:
+        """Create a tuple containg all cells from a given row associated to primary
+        keys"""
+        return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive))
+
+    def primary_key_labels(
+        self,
+        row: Row,
+        case_sensitive: bool,
+    ) -> List[str]:
+        """Create a list of TableResource labels that are primary keys"""
+        if case_sensitive:
+            labels_primary_key = self.schema.primary_key
+        else:
+            lower_primary_key = [pk.lower() for pk in self.schema.primary_key]
+            labels_primary_key = [
+                label for label in row.field_names if label.lower() in lower_primary_key
+            ]
+        return labels_primary_key
+
+    # Read
     def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]:
         """Read lists into memory
 
diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py
index e3fce0a921..b4c43600b1 100644
--- a/frictionless/table/__spec__/test_header.py
+++ b/frictionless/table/__spec__/test_header.py
@@ -1,3 +1,6 @@
+import pytest
+
+import frictionless
 from frictionless import Schema, fields
 from frictionless.resources import TableResource
 
@@ -37,3 +40,44 @@ def test_missing_label():
         assert header == ["id", "name", "extra"]
         assert header.labels == ["id", "name"]
         assert header.valid is False
+
+
+@pytest.mark.parametrize(
+    "source, required, valid_report, nb_errors, types_errors_expected, header_case",
+    [
+        ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
+        ([["B"], ["foo"]], {}, False, 1, ["missing-label"], True),
+        ([["a"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
+        ([["a"], ["foo"]], {}, False, 1, ["missing-label"], True),
+        # Ignore header_case
+        ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], False),
+        ([["B"], ["foo"]], {}, False, 1, ["missing-label"], False),
+        ([["a"], ["foo"]], {"required": True}, True, 0, [], False),
+        ([["a"], ["foo"]], {}, True, 0, [], False),
+    ],
+)
+def test_missing_primary_key_label_with_shema_sync_issue_1633(
+    source, required, valid_report, nb_errors, types_errors_expected, header_case
+):
+    schema_descriptor = {
+        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
+        "fields": [{"name": "A", "constraints": required}],
+        "primaryKey": ["A"],
+    }
+
+    resource = TableResource(
+        source=source,
+        schema=Schema.from_descriptor(schema_descriptor),
+        detector=frictionless.Detector(schema_sync=True),
+        dialect=frictionless.Dialect(header_case=header_case),
+    )
+
+    report = frictionless.validate(resource)
+
+    assert report.valid == valid_report
+
+    if not report.valid:
+        errors = report.tasks[0].errors
+        assert len(errors) == nb_errors
+        for error, type_expected in zip(errors, types_errors_expected):
+            assert error.type == type_expected
diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py
index 616f0a2728..b386f05c3b 100644
--- a/frictionless/validator/__spec__/resource/test_schema.py
+++ b/frictionless/validator/__spec__/resource/test_schema.py
@@ -3,7 +3,14 @@
 import pytest
 
 import frictionless
-from frictionless import Checklist, Detector, FrictionlessException, Schema, fields
+from frictionless import (
+    Checklist,
+    Detector,
+    Dialect,
+    FrictionlessException,
+    Schema,
+    fields,
+)
 from frictionless.resources import TableResource
 
 # General
@@ -310,7 +317,7 @@ def test_resource_validate_less_actual_fields_with_required_constraint_issue_950
 
 
 def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_1611():
-    schema_descriptor_1 = {
+    schema_A_required = {
         "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
         "fields": [
             {
@@ -324,20 +331,20 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16
         ],
     }
 
-    schema_descriptor_2 = deepcopy(schema_descriptor_1)
+    schema_AC_required = deepcopy(schema_A_required)
     # Add required constraint on "C" field
-    schema_descriptor_2["fields"][2]["constraints"] = {"required": True}
+    schema_AC_required["fields"][2]["constraints"] = {"required": True}
 
     test_cases = [
         {
-            "schema": schema_descriptor_1,
+            "schema": schema_A_required,
             "source": [["B", "C"], ["b", "c"]],
             "expected_flattened_report": [
                 [None, 3, "A", "missing-label"],
             ],
         },
         {
-            "schema": schema_descriptor_2,
+            "schema": schema_AC_required,
             "source": [["B"], ["b"]],
             "expected_flattened_report": [
                 [None, 2, "A", "missing-label"],
@@ -345,7 +352,7 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16
             ],
         },
         {
-            "schema": schema_descriptor_2,
+            "schema": schema_AC_required,
             "source": [
                 ["A", "B"],
                 ["a", "b"],
@@ -362,14 +369,73 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16
             ],
         },
     ]
+
     for tc in test_cases:
         schema = Schema.from_descriptor(tc["schema"])
         resource = TableResource(
             tc["source"], schema=schema, detector=Detector(schema_sync=True)
         )
+
         report = frictionless.validate(resource)
-        print(report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]))
+
         assert (
             report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])
             == tc["expected_flattened_report"]
         )
+
+
+def test_validate_resource_ignoring_header_case_issue_1635():
+    schema_descriptor = {
+        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
+        "fields": [
+            {
+                "name": "aa",
+                "title": "Field A",
+                "type": "string",
+                "constraints": {"required": True},
+            },
+            {
+                "name": "BB",
+                "title": "Field B",
+                "type": "string",
+                "constraints": {"required": True},
+            },
+        ],
+    }
+
+    test_cases = [
+        {
+            "source": [["AA", "bb"], ["a", "b"]],
+            "header_case": False,
+            "expected_valid_report": True,
+            "expected_flattened_report": [],
+        },
+        {
+            "source": [["AA", "bb"], ["a", "b"]],
+            "header_case": True,
+            "expected_valid_report": False,
+            "expected_flattened_report": [
+                [None, 3, "aa", "missing-label"],
+                [None, 4, "BB", "missing-label"],
+            ],
+        },
+        {
+            "source": [["bb"], ["foo"]],
+            "header_case": False,
+            "expected_valid_report": False,
+            "expected_flattened_report": [[None, 2, "aa", "missing-label"]],
+        },
+    ]
+
+    for tc in test_cases:
+        resource = TableResource(
+            tc["source"],
+            schema=Schema.from_descriptor(schema_descriptor),
+            detector=Detector(schema_sync=True),
+            dialect=Dialect(header_case=tc["header_case"]),
+        )
+        report = frictionless.validate(resource)
+        assert report.valid == tc["expected_valid_report"]
+        assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[
+            "expected_flattened_report"
+        ]