Skip to content

Commit

Permalink
Merge pull request #325 from gaiaresources/BDRSPS-1010
Browse files Browse the repository at this point in the history
BDRSPS-1010 Rejigged the extra fields schema creation logic
  • Loading branch information
Lincoln-GR authored Nov 19, 2024
2 parents c184d97 + 6b69e6c commit 531a89c
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 14 deletions.
36 changes: 23 additions & 13 deletions abis_mapping/base/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

# Third-Party
import frictionless
import frictionless.errors
import rdflib
import rdflib.term

Expand Down Expand Up @@ -238,21 +239,22 @@ def extra_fields_schema(
"""Creates a schema with all extra fields found in data.
The fields of data are expected to be the same or a superset of the template's
official schema. It is expected that validation has occurred prior to calling.
official schema.
Args:
data (frictionless.Row | types.ReadableType): Row or data expected to
data: Row or data expected to
contain more columns than included in the template's schema.
full_schema (bool): Flag to indicate whether full schema for row or data
full_schema: Flag to indicate whether full schema for row or data
should be returned or just the difference.
Returns:
frictionless.Schema: A schema object, the fields of which are only
the extra fields not a part of a template's official schema if full_schema = False
else the schema will be the concatenation of the official schema fields and the
extra fields.
A schema object, the fields of which are only the extra fields not a part of a
template's official schema if full_schema = False else the schema will be the
concatenation of the official schema fields and the extra fields. Extra fields
are deemed to be any fields not named within the existing schema, as well as
any fields that are duplicated within the labels of the supplied data.
"""
# Construct official schema
# Construct schema
existing_schema: frictionless.Schema = frictionless.Schema.from_descriptor(cls.schema())

if isinstance(data, frictionless.Row):
Expand All @@ -273,21 +275,29 @@ def extra_fields_schema(

# Find list of extra fieldnames
existing_fieldnames = existing_schema.field_names
if len(actual_fieldnames) > len(existing_fieldnames):
extra_fieldnames = actual_fieldnames[len(existing_fieldnames) :]
else:
extra_fieldnames = []

# Collection for unseen fieldnames, allowing for duplicates to be created.
unseen_existing = [*existing_fieldnames]

# Get extra fieldnames
extra_fieldnames: list[str] = []
for fn in actual_fieldnames:
if fn not in unseen_existing:
extra_fieldnames.append(fn)
if fn in unseen_existing:
unseen_existing.remove(fn)

# Construct list of extra Fields with type of string
extra_fields = [
frictionless.Field.from_descriptor({"name": fieldname, "type": "string"}) for fieldname in extra_fieldnames
]

if full_schema:
# Append the extra fields onto the official schema and return
# Append the extra fields onto the official schema
for field in extra_fields:
existing_schema.add_field(field)

# Return
return existing_schema

# Create difference schema and return
Expand Down
99 changes: 98 additions & 1 deletion tests/base/test_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ def test_extra_fields_schema_raw_data(mocker: pytest_mock.MockerFixture) -> None

# Get mapper
mapper = base.mapper.ABISMapper
assert mapper is not None

# Get data
csv_data = data_to_csv(data)
Expand All @@ -263,6 +262,104 @@ def test_extra_fields_schema_raw_data(mocker: pytest_mock.MockerFixture) -> None
assert set(full_schema.field_names) == set(existing_schema.field_names) | expected_extra_fieldnames


def test_extra_fields_schema_inserted_fields_midway(mocker: pytest_mock.MockerFixture) -> None:
    """Checks that extra columns inserted between template columns fail validation.

    The raw data places columns "C", "D" and "E" between the schema's "A" and "B"
    columns. The full schema built by extra_fields_schema is expected to list the
    official fields first, followed by the extras, so validating the raw data
    against it should report both label and type errors.

    Args:
        mocker: The mocker fixture.
    """
    # Rows whose column order puts "C", "D" and "E" between "A" and "B"
    rows = [
        {"A": 123, "C": 321.6546454654654, "D": True, "E": "something", "B": 321},
        {"A": 321, "C": 6.54654e-15, "D": False, "E": "another thing", "B": 123},
    ]
    raw_csv = data_to_csv(rows)

    # The official schema only knows about the integer columns "A" and "B"
    base_descriptor = {"fields": [{"name": name, "type": "integer"} for name in ("A", "B")]}
    schema_mock = mocker.patch.object(base.mapper.ABISMapper, "schema")
    schema_mock.return_value = base_descriptor

    # Build the full schema, i.e. official fields plus any extras found in the data
    full_schema = base.mapper.ABISMapper.extra_fields_schema(raw_csv, full_schema=True)

    # Wrap the raw csv in a resource that is checked against that schema
    resource = frictionless.Resource(
        source=raw_csv,
        format="csv",
        schema=full_schema,
        encoding="utf-8",
    )

    # The field order may look nonsensical, but the method appends extra fields
    # (i.e. any fields not named in the defined schema, plus any duplicate names)
    # to the end of the original schema. The label checks run on the data during
    # validation are what reveal the problem, which mirrors what happens when new
    # columns are inserted into raw data of a frictionless resource that has a
    # defined schema.
    field_names = [field.name for field in full_schema.fields]
    assert field_names == ["A", "B", "C", "D", "E"]

    # Validate the raw data against the constructed schema
    report = resource.validate()

    # Collapse the flattened report into the distinct error types raised
    error_types = {error_type for entry in report.flatten(["type"]) for error_type in entry}
    assert error_types == {"incorrect-label", "type-error"}


def test_extra_fields_schema_inserted_fields_duplicate_original(mocker: pytest_mock.MockerFixture) -> None:
    """Checks that a column duplicating an official field name is handled as an extra field.

    The raw data repeats the label "B", which is already part of the official
    schema. The full schema is expected to carry the duplicate as an additional
    string field, and validating the raw data against it should report a
    duplicate label error.

    Args:
        mocker: The mocker fixture.
    """
    # Raw csv lines; the header repeats the official "B" column at the end
    csv_lines = [
        "A,B,C,B",
        "123,321,321.654654654,333",
        "321,123,6.545454,111",
    ]
    raw_csv = "\n".join(csv_lines).encode("utf-8")

    # The official schema only knows about the integer columns "A" and "B"
    base_descriptor = {"fields": [{"name": name, "type": "integer"} for name in ("A", "B")]}
    schema_mock = mocker.patch.object(base.mapper.ABISMapper, "schema")
    schema_mock.return_value = base_descriptor

    # Build the full schema, i.e. official fields plus any extras found in the data
    full_schema = base.mapper.ABISMapper.extra_fields_schema(raw_csv, full_schema=True)

    # The duplicated "B" label should show up as an additional string field
    expected_fields = [
        ("A", "integer"),
        ("B", "integer"),
        ("C", "string"),
        # Frictionless automatically appends '2' to avoid duplication.
        # However the labels of the raw csv will still raise validation errors.
        ("B2", "string"),
    ]
    actual_fields = [(field.name, field.type) for field in full_schema.fields]
    assert actual_fields == expected_fields

    # Wrap the raw csv in a resource that is checked against that schema
    resource = frictionless.Resource(
        source=raw_csv,
        format="csv",
        schema=full_schema,
        encoding="utf-8",
    )

    # Validate the raw data against the constructed schema
    report = resource.validate()

    # Collapse the flattened report into the distinct error types raised
    error_types = {error_type for entry in report.flatten(["type"]) for error_type in entry}
    assert error_types == {"duplicate-label"}


def test_extract_extra_fields(mocker: pytest_mock.MockerFixture) -> None:
"""Tests extraction of extra fields from a row.
Expand Down

0 comments on commit 531a89c

Please sign in to comment.