From 7861f0e6e3dd9e8392d1d8d57f850c7b5862b81a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= <119412389+amelie-rondot@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:22:48 +0200 Subject: [PATCH] Fix 1633 unexpected key error with primary key in schema and 1635 unexpected missing label error when ignoring header case (#1641) - fixes #1633 - fixes #1635 --- This PR fixes unexpected `missing-label` error in validation report which occurs when validating tabular data in this specific case (#1635): The data contains a named column corresponding to a required field in the schema used for validation, written without respecting case-sensitivity. For example: ``` data = [["aa", "BB"], ["a", "b"]] schema = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ {"name": "AA", "constraints": {"required": True}}, {"name": "bb", "constraints": {"required": True}} ] } ``` It adds some test cases: - one test case to ensure that validating this tabular data with this schema and using `schema_sync=True` and `dialect=Dialect(header_case=False)` options, returns a valid report. - one test case to ensure that if validation is case-sensitive, if a name column does not respect case a missing-label occurs in validation report. - another test case to ensure that validating with missing required column "AA" with this schema and using `schema_sync=True` and `dialect=Dialect(header_case=False)` options, returns an invalid report with `missing-label` error related to field "AA". This PR also fixes unexpected KeyError raised when a primary key used in shema is missing in the tabular data to validate(#1633). For example: ``` data = [["B"], ["b"]] schema = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ {"name": "A"}, {"name": "B"} ], "primaryKey": ["A"] } ``` It adds some test cases: - two test cases to ensure that when a label related to the schema primary key is missing, the validation report is invalid with single missing-label error: - a test case with the schema containing the primary key field not specified as 'required' - a test case with the schema containing the primary key field specified as 'required' - a test case to deal with insensitivity case in the data label corresponding to the primary key schema field: the validation report is valid Finally, I suggest some refactoring in this PR: - refactoring removing missing required label from field info refactoring removing missing required label from field info - refactoring creating schema fields among labels part when using `schema_sync` option in `detect_schema()` `Detector` method - refactoring to introduce intermediary `TableResource` methods to create `cells` related to primary key schema. --------- Co-authored-by: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> --- frictionless/detector/detector.py | 88 ++++++++++++++--- .../resource/__spec__/test_security.py | 8 +- frictionless/resources/table.py | 94 +++++++++++++++---- frictionless/table/__spec__/test_header.py | 44 +++++++++ .../__spec__/resource/test_schema.py | 82 ++++++++++++++-- 5 files changed, 276 insertions(+), 40 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index dd656427d5..fc3e47e0a1 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -298,6 +298,7 @@ def detect_schema( labels: Optional[List[str]] = None, schema: Optional[Schema] = None, field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES, + **options: Any, ) -> Schema: """Detect schema from fragment @@ -405,20 +406,29 @@ def detect_schema( # Sync schema if self.schema_sync: if labels: + case_sensitive = options["header_case"] + + if not case_sensitive: + labels = [label.lower() for label in labels] + if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - mapping = {field.name: field for field in schema.fields} # type: ignore - schema.clear_fields() - for name in labels: - field = mapping.get(name) - if not field: - field = Field.from_descriptor({"name": name, "type": "any"}) - schema.add_field(field) - # For required fields that are missing - for _, field in mapping.items(): - if field and field.required and field.name not in labels: - schema.add_field(field) + + mapped_fields = self.mapped_schema_fields_names( + schema.fields, # type: ignore + case_sensitive, + ) + + self.rearrange_schema_fields_given_labels( + mapped_fields, + schema, + labels, + ) + + self.add_missing_required_labels_to_schema_fields( + mapped_fields, schema, labels, case_sensitive + ) # Patch schema if self.schema_patch: @@ -432,4 +442,58 @@ def detect_schema( field_descriptor.update(field_patch) schema = Schema.from_descriptor(descriptor) - return schema # type: ignore + return schema + + @staticmethod + def mapped_schema_fields_names( + fields: List[Field], case_sensitive: bool + ) -> Dict[str, Field]: + """Create a dictionnary to map field names with schema fields""" + if case_sensitive: + return {field.name: field for field in fields} + else: + return {field.name.lower(): field for field in fields} + + @staticmethod + def rearrange_schema_fields_given_labels( + fields_mapping: Dict[str, Field], + schema: Schema, + labels: List[str], + ): + """Rearrange fields according to the order of labels. All fields + missing from labels are dropped""" + schema.clear_fields() + + for name in labels: + default_field = Field.from_descriptor({"name": name, "type": "any"}) + field = fields_mapping.get(name, default_field) + schema.add_field(field) + + def add_missing_required_labels_to_schema_fields( + self, + fields_mapping: Dict[str, Field], + schema: Schema, + labels: List[str], + case_sensitive: bool, + ): + """This method aims to add missing required labels and + primary key field not in labels to schema fields. + """ + for name, field in fields_mapping.items(): + if ( + self.field_is_required(field, schema, case_sensitive) + and name not in labels + ): + schema.add_field(field) + + @staticmethod + def field_is_required( + field: Field, + schema: Schema, + case_sensitive: bool, + ) -> bool: + if case_sensitive: + return field.required or field.name in schema.primary_key + else: + lower_primary_key = [pk.lower() for pk in schema.primary_key] + return field.required or field.name.lower() in lower_primary_key diff --git a/frictionless/resource/__spec__/test_security.py b/frictionless/resource/__spec__/test_security.py index e44916e727..36c2fc2b3b 100644 --- a/frictionless/resource/__spec__/test_security.py +++ b/frictionless/resource/__spec__/test_security.py @@ -27,9 +27,11 @@ def test_resource_source_path_error_bad_path_not_safe_traversing(): Resource( { "name": "name", - "path": "data/../data/table.csv" - if not platform.type == "windows" - else "data\\..\\table.csv", + "path": ( + "data/../data/table.csv" + if not platform.type == "windows" + else "data\\..\\table.csv" + ), } ) error = excinfo.value.error diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 98a76f3473..fdf9ffce4b 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -5,6 +5,8 @@ import warnings from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from frictionless.schema.field import Field + from .. import errors, helpers, settings from ..analyzer import Analyzer from ..dialect import Dialect @@ -204,6 +206,7 @@ def __open_schema(self): labels=self.labels, schema=self.schema, field_candidates=system.detect_field_candidates(), + header_case=self.dialect.header_case, ) self.stats.fields = len(self.schema.fields) @@ -331,15 +334,21 @@ def row_stream(): # Primary Key Error if is_integrity and self.schema.primary_key: - cells = tuple(row[name] for name in self.schema.primary_key) - if set(cells) == {None}: - note = 'cells composing the primary keys are all "None"' - error = errors.PrimaryKeyError.from_row(row, note=note) - row.errors.append(error) + try: + cells = self.primary_key_cells(row, self.dialect.header_case) + except KeyError: + # Row does not have primary_key as label + # There should already be a missing-label error in + # in self.header corresponding to the schema primary key + assert not self.header.valid else: - match = memory_primary.get(cells) - memory_primary[cells] = row.row_number - if match: + if set(cells) == {None}: + note = 'cells composing the primary keys are all "None"' + error = errors.PrimaryKeyError.from_row(row, note=note) + row.errors.append(error) + else: + match = memory_primary.get(cells) + memory_primary[cells] = row.row_number if match: note = "the same as in the row at position %s" % match error = errors.PrimaryKeyError.from_row(row, note=note) @@ -386,20 +395,71 @@ def row_stream(): # Yield row yield row - # NB: missing required labels are not included in the - # field_info parameter used for row creation if self.detector.schema_sync: + # Missing required labels are not included in the + # field_info parameter used for row creation for field in self.schema.fields: - if field.name not in self.labels and field.name in field_info["names"]: - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] - # # Create row stream + self.remove_missing_required_label_from_field_info(field, field_info) + + # Create row stream self.__row_stream = row_stream() - # Read + def remove_missing_required_label_from_field_info( + self, field: Field, field_info: Dict[str, Any] + ): + is_case_sensitive = self.dialect.header_case + if self.label_is_missing( + field.name, field_info["names"], self.labels, is_case_sensitive + ): + self.remove_field_from_field_info(field.name, field_info) + + @staticmethod + def label_is_missing( + field_name: str, + expected_field_names: List[str], + table_labels: types.ILabels, + case_sensitive: bool, + ) -> bool: + """Check if a schema field name is missing from the TableResource + labels. + """ + if not case_sensitive: + field_name = field_name.lower() + table_labels = [label.lower() for label in table_labels] + expected_field_names = [ + field_name.lower() for field_name in expected_field_names + ] + + return field_name not in table_labels and field_name in expected_field_names + @staticmethod + def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): + field_index = field_info["names"].index(field_name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field_name] + + def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: + """Create a tuple containg all cells from a given row associated to primary + keys""" + return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive)) + + def primary_key_labels( + self, + row: Row, + case_sensitive: bool, + ) -> List[str]: + """Create a list of TableResource labels that are primary keys""" + if case_sensitive: + labels_primary_key = self.schema.primary_key + else: + lower_primary_key = [pk.lower() for pk in self.schema.primary_key] + labels_primary_key = [ + label for label in row.field_names if label.lower() in lower_primary_key + ] + return labels_primary_key + + # Read def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]: """Read lists into memory diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index e3fce0a921..b4c43600b1 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -1,3 +1,6 @@ +import pytest + +import frictionless from frictionless import Schema, fields from frictionless.resources import TableResource @@ -37,3 +40,44 @@ def test_missing_label(): assert header == ["id", "name", "extra"] assert header.labels == ["id", "name"] assert header.valid is False + + +@pytest.mark.parametrize( + "source, required, valid_report, nb_errors, types_errors_expected, header_case", + [ + ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True), + ([["B"], ["foo"]], {}, False, 1, ["missing-label"], True), + ([["a"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True), + ([["a"], ["foo"]], {}, False, 1, ["missing-label"], True), + # Ignore header_case + ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], False), + ([["B"], ["foo"]], {}, False, 1, ["missing-label"], False), + ([["a"], ["foo"]], {"required": True}, True, 0, [], False), + ([["a"], ["foo"]], {}, True, 0, [], False), + ], +) +def test_missing_primary_key_label_with_shema_sync_issue_1633( + source, required, valid_report, nb_errors, types_errors_expected, header_case +): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": required}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=header_case), + ) + + report = frictionless.validate(resource) + + assert report.valid == valid_report + + if not report.valid: + errors = report.tasks[0].errors + assert len(errors) == nb_errors + for error, type_expected in zip(errors, types_errors_expected): + assert error.type == type_expected diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 616f0a2728..b386f05c3b 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -3,7 +3,14 @@ import pytest import frictionless -from frictionless import Checklist, Detector, FrictionlessException, Schema, fields +from frictionless import ( + Checklist, + Detector, + Dialect, + FrictionlessException, + Schema, + fields, +) from frictionless.resources import TableResource # General @@ -310,7 +317,7 @@ def test_resource_validate_less_actual_fields_with_required_constraint_issue_950 def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_1611(): - schema_descriptor_1 = { + schema_A_required = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ { @@ -324,20 +331,20 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], } - schema_descriptor_2 = deepcopy(schema_descriptor_1) + schema_AC_required = deepcopy(schema_A_required) # Add required constraint on "C" field - schema_descriptor_2["fields"][2]["constraints"] = {"required": True} + schema_AC_required["fields"][2]["constraints"] = {"required": True} test_cases = [ { - "schema": schema_descriptor_1, + "schema": schema_A_required, "source": [["B", "C"], ["b", "c"]], "expected_flattened_report": [ [None, 3, "A", "missing-label"], ], }, { - "schema": schema_descriptor_2, + "schema": schema_AC_required, "source": [["B"], ["b"]], "expected_flattened_report": [ [None, 2, "A", "missing-label"], @@ -345,7 +352,7 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], }, { - "schema": schema_descriptor_2, + "schema": schema_AC_required, "source": [ ["A", "B"], ["a", "b"], @@ -362,14 +369,73 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], }, ] + for tc in test_cases: schema = Schema.from_descriptor(tc["schema"]) resource = TableResource( tc["source"], schema=schema, detector=Detector(schema_sync=True) ) + report = frictionless.validate(resource) - print(report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) + assert ( report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"] ) + + +def test_validate_resource_ignoring_header_case_issue_1635(): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [ + { + "name": "aa", + "title": "Field A", + "type": "string", + "constraints": {"required": True}, + }, + { + "name": "BB", + "title": "Field B", + "type": "string", + "constraints": {"required": True}, + }, + ], + } + + test_cases = [ + { + "source": [["AA", "bb"], ["a", "b"]], + "header_case": False, + "expected_valid_report": True, + "expected_flattened_report": [], + }, + { + "source": [["AA", "bb"], ["a", "b"]], + "header_case": True, + "expected_valid_report": False, + "expected_flattened_report": [ + [None, 3, "aa", "missing-label"], + [None, 4, "BB", "missing-label"], + ], + }, + { + "source": [["bb"], ["foo"]], + "header_case": False, + "expected_valid_report": False, + "expected_flattened_report": [[None, 2, "aa", "missing-label"]], + }, + ] + + for tc in test_cases: + resource = TableResource( + tc["source"], + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True), + dialect=Dialect(header_case=tc["header_case"]), + ) + report = frictionless.validate(resource) + assert report.valid == tc["expected_valid_report"] + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[ + "expected_flattened_report" + ]