Merge pull request #325 from gaiaresources/BDRSPS-1010

BDRSPS-1010 Rejigged the extra fields schema creation logic
gaiaresources · Nov 19, 2024 · 531a89c · 531a89c
2 parents c184d97 + 6b69e6c
commit 531a89c
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 14 deletions.
diff --git a/abis_mapping/base/mapper.py b/abis_mapping/base/mapper.py
@@ -10,6 +10,7 @@
 
 # Third-Party
 import frictionless
+import frictionless.errors
 import rdflib
 import rdflib.term
 
@@ -238,21 +239,22 @@ def extra_fields_schema(
         """Creates a schema with all extra fields found in data.
 
         The fields of data are expected to be the same or a superset of the template's
-        official schema. It is expected that validation has occurred prior to calling.
+        official schema.
 
         Args:
-            data (frictionless.Row | types.ReadableType): Row or data expected to
+            data: Row or data expected to
                 contain more columns than included in the template's schema.
-            full_schema (bool): Flag to indicate whether full schema for row or data
+            full_schema: Flag to indicate whether full schema for row or data
                 should be returned or just the difference.
 
         Returns:
-            frictionless.Schema: A schema object, the fields of which are only
-                the extra fields not a part of a template's official schema if full_schema = False
-                else the schema will be the concatenation of the official schema fields and the
-                extra fields.
+            A schema object. If full_schema is False, its fields are only the extra fields that
+                are not part of the template's official schema; otherwise the schema is the
+                concatenation of the official schema fields and the extra fields. Extra fields
+                are deemed to be any fields not named within the existing schema, as well as
+                any fields that are duplicated within the labels of the supplied data.
         """
-        # Construct official schema
+        # Construct schema
         existing_schema: frictionless.Schema = frictionless.Schema.from_descriptor(cls.schema())
 
         if isinstance(data, frictionless.Row):
@@ -273,21 +275,29 @@ def extra_fields_schema(
 
         # Find list of extra fieldnames
         existing_fieldnames = existing_schema.field_names
-        if len(actual_fieldnames) > len(existing_fieldnames):
-            extra_fieldnames = actual_fieldnames[len(existing_fieldnames) :]
-        else:
-            extra_fieldnames = []
+
+        # Existing fieldnames not yet matched in the data; each is consumed once, so a repeated label counts as extra.
+        unseen_existing = [*existing_fieldnames]
+
+        # Get extra fieldnames
+        extra_fieldnames: list[str] = []
+        for fn in actual_fieldnames:
+            if fn not in unseen_existing:
+                extra_fieldnames.append(fn)
+            if fn in unseen_existing:
+                unseen_existing.remove(fn)
 
         # Construct list of extra Fields with type of string
         extra_fields = [
             frictionless.Field.from_descriptor({"name": fieldname, "type": "string"}) for fieldname in extra_fieldnames
         ]
 
         if full_schema:
-            # Append the extra fields onto the official schema and return
+            # Append the extra fields onto the official schema
             for field in extra_fields:
                 existing_schema.add_field(field)
 
+            # Return
             return existing_schema
 
         # Create difference schema and return

diff --git a/tests/base/test_mapper.py b/tests/base/test_mapper.py
@@ -238,7 +238,6 @@ def test_extra_fields_schema_raw_data(mocker: pytest_mock.MockerFixture) -> None
 
     # Get mapper
     mapper = base.mapper.ABISMapper
-    assert mapper is not None
 
     # Get data
     csv_data = data_to_csv(data)
@@ -263,6 +262,104 @@ def test_extra_fields_schema_raw_data(mocker: pytest_mock.MockerFixture) -> None
     assert set(full_schema.field_names) == set(existing_schema.field_names) | expected_extra_fieldnames
 
 
+def test_extra_fields_schema_inserted_fields_midway(mocker: pytest_mock.MockerFixture) -> None:
+    """Tests extra_fields_schema validation fails correctly for extra columns entered midway.
+
+    Args:
+        mocker: The mocker fixture.
+    """
+    # Construct dataset
+    data = [
+        {"A": 123, "C": 321.6546454654654, "D": True, "E": "something", "B": 321},
+        {"A": 321, "C": 6.54654e-15, "D": False, "E": "another thing", "B": 123},
+    ]
+
+    # Construct base schema descriptor
+    descriptor = {"fields": [{"name": "A", "type": "integer"}, {"name": "B", "type": "integer"}]}
+
+    # Construct csv with extra fields between "A" and "B"
+    csv_data = data_to_csv(data)
+
+    # Patch schema method and return descriptor
+    mocker.patch.object(base.mapper.ABISMapper, "schema").return_value = descriptor
+
+    # Construct schema (includes extra fields)
+    schema = base.mapper.ABISMapper.extra_fields_schema(csv_data, full_schema=True)
+
+    # Construct resource
+    resource = frictionless.Resource(
+        source=csv_data,
+        format="csv",
+        schema=schema,
+        encoding="utf-8",
+    )
+
+    # Schema may look nonsensical but the method appends extra
+    # fields (i.e. any fields not named originally within the defined schema and any duplicate names)
+    # to the end of the original schema and the subsequent label checks on the data will
+    # reveal the issue. This is the same behaviour as inserting new columns into raw data
+    # with a defined schema for a frictionless resource in any case.
+    assert [f.name for f in schema.fields] == ["A", "B", "C", "D", "E"]
+
+    # Validate
+    report = resource.validate()
+
+    # Confirm error types
+    errors = report.flatten(["type"])
+    assert set([e for err in errors for e in err]) == {"incorrect-label", "type-error"}
+
+
+def test_extra_fields_schema_inserted_fields_duplicate_original(mocker: pytest_mock.MockerFixture) -> None:
+    """Tests extra_fields_schema fails correctly for extra columns that are duplicates of original fields.
+
+    Args:
+        mocker: The mocker fixture.
+    """
+    # Create dataset
+    data = [
+        "A,B,C,B",
+        "123,321,321.654654654,333",
+        "321,123,6.545454,111",
+    ]
+
+    # Create bytes object
+    csv_data = "\n".join(data).encode("utf-8")
+
+    # Construct base schema descriptor
+    descriptor = {"fields": [{"name": "A", "type": "integer"}, {"name": "B", "type": "integer"}]}
+
+    # Patch schema method and return descriptor
+    mocker.patch.object(base.mapper.ABISMapper, "schema").return_value = descriptor
+
+    # Construct schema with extra fields
+    schema = base.mapper.ABISMapper.extra_fields_schema(csv_data, full_schema=True)
+
+    # Schema should have duplicate B field
+    assert [(f.name, f.type) for f in schema.fields] == [
+        ("A", "integer"),
+        ("B", "integer"),
+        ("C", "string"),
+        # Frictionless automatically appends '2' to avoid duplication.
+        # However the csv_data labels will still raise validation errors.
+        ("B2", "string"),
+    ]
+
+    # Construct resource
+    resource = frictionless.Resource(
+        source=csv_data,
+        format="csv",
+        schema=schema,
+        encoding="utf-8",
+    )
+
+    # Validate
+    report = resource.validate()
+
+    # Confirm error types
+    errors = report.flatten(["type"])
+    assert set([e for err in errors for e in err]) == {"duplicate-label"}
+
+
 def test_extract_extra_fields(mocker: pytest_mock.MockerFixture) -> None:
     """Tests extraction of extra fields from a row.