Skip to content

Commit

Permalink
Merge pull request #349 from gaiaresources/BDRSPS-1068
Browse files Browse the repository at this point in the history
BDRSPS-1068 Add custom string field to treat whitespace-only cells as empty
  • Loading branch information
joecrowleygaia authored Nov 28, 2024
2 parents c3d5200 + 0c80286 commit 2809657
Show file tree
Hide file tree
Showing 4 changed files with 240 additions and 0 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,15 @@ python docs/tables/vocabs.py <template_id> [-o] <output_file>
where `output_file` (optional) corresponds to a location to store the resulting csv.
Default output is standard out
if no `output_file` argument provided.

## Frictionless

The project uses the [Frictionless Framework](https://framework.frictionlessdata.io/) to validate and read the
template CSV files.
We have a number of plugins defined in the [`/abis_mapping/plugins`](/abis_mapping/plugins) directory which
customize and extend how Frictionless works.

### Overridden String Field Class

In particular, we have the [`abis_mapping/plugins/string_customized.py`](/abis_mapping/plugins/string_customized.py)
plugin which overrides the default Frictionless string field to use our custom class.
1 change: 1 addition & 0 deletions abis_mapping/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from . import related_site_id_part_of_lookup
from . import required
from . import sites_geometry
from . import string_customized
from . import tabular
from . import timestamp
from . import wkt
83 changes: 83 additions & 0 deletions abis_mapping/plugins/string_customized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Provides the customised string frictionless plugin and field."""

# Third-Party
import attrs
import frictionless
import frictionless.fields

# Typing
from typing import Any


class CustomizedStringPlugin(frictionless.Plugin):
"""Customized String Plugin."""

def select_field_class(
self,
type: str | None = None, # noqa: A002
) -> type[frictionless.Field] | None:
"""This hook allows a plugin to override the class used for a field, depending on the field's type.
Args:
type: The type of the field.
Returns:
Return the class to use for the field, or None.
"""
# Override the class for string fields with our custom class.
if type == "string":
return CustomizedStringField
# Not a string field, don't override the class.
# Frictionless will fall back to other plugins or the built-in field classes.
return None


@attrs.define(kw_only=True, repr=False)
class CustomizedStringField(frictionless.fields.StringField):
"""Custom String Field.
Compared to the normal string field, this class
1. Converts any whitespace-only cell to the empty string before validating and reading it.
This is so whitespace-only cells are treated the same as empty cells.
"""

# Class attributes
type = "string"
builtin = False

# Read

def create_cell_reader(self) -> frictionless.schema.ICellReader:
"""Override the way that cells are read.
NOTE we override create_cell_reader() rather than create_value_reader(), so that
1. An all-whitespace cell is converted to an emtpy string BEFORE frictionless
checks if the cell is an "empty value".
This means an all-whitespace cell will be None when our code reads the
value when mapping, the same as an empty cell.
2. This also means that an all-whitespace cell will fail validation when the
field is required.
Returns:
A function to read cell contents.
"""
# get the cell_reader for the parent class (StringField)
default_cell_reader = super().create_cell_reader()

# define our custom cell reader
def cell_reader(cell: Any) -> tuple[Any, frictionless.schema.INotes]:
# first convert whitespace-only cell to empty string
if isinstance(cell, str) and cell.isspace():
cell = ""

# then let frictionless do its normal conversions/validations
return default_cell_reader(cell)

return cell_reader


# Register Plugin
frictionless.system.register(
name="customized-string",
plugin=CustomizedStringPlugin(),
)
144 changes: 144 additions & 0 deletions tests/plugins/test_customized_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import frictionless

from abis_mapping import plugins


def test_customized_string_plugin() -> None:
"""Tests the customized string plugin"""
# Instantiate plugin
plugin = plugins.string_customized.CustomizedStringPlugin()

# Incorrect type
result = plugin.select_field_class("notAType")
assert result is None

# Type this plugin overrides field class for
result = plugin.select_field_class("string")
assert result is plugins.string_customized.CustomizedStringField


def test_customized_string_registered() -> None:
"""Tests the customized string class is used for string fields."""
# Create schema with string field
schema = frictionless.Schema.from_descriptor({"fields": [{"name": "foo", "type": "string"}]})

# Extract string field
field = schema.get_field("foo")

# field is our custom class
assert isinstance(field, plugins.string_customized.CustomizedStringField)


def test_customized_string_field() -> None:
"""Tests the customized string field."""
# Instantiate the field
field = plugins.string_customized.CustomizedStringField(name="TestField")

# Normal string field behavior
assert field.read_cell("") == (None, None)
assert field.read_cell("foo") == ("foo", None)
assert field.read_cell(" foo") == (" foo", None)
assert field.read_cell("foo ") == ("foo ", None)

# All whitespace value is customized to None
assert field.read_cell(" ") == (None, None)
assert field.read_cell(" ") == (None, None)
assert field.read_cell("\n ") == (None, None)
assert field.read_cell(" \r\n") == (None, None)
assert field.read_cell(" \t ") == (None, None)


def test_customized_string_field_with_required_constraint() -> None:
"""Tests the customized string field with a required constraint."""
# Instantiate the field
required_field = plugins.string_customized.CustomizedStringField(
name="TestField",
constraints={"required": True},
)

# normal string field behavior
assert required_field.read_cell("foo") == ("foo", None)
assert required_field.read_cell("") == (None, {"required": 'constraint "required" is "True"'})

# Normalization is applied before checking constraint, making these invalid.
assert required_field.read_cell(" ") == (None, {"required": 'constraint "required" is "True"'})
assert required_field.read_cell("\t") == (None, {"required": 'constraint "required" is "True"'})


def test_using_customized_string_field_with_resource() -> None:
"""Test reading a valid csv where the cells are string fields."""
# Create schema with some string fields
schema = frictionless.Schema.from_descriptor(
{
"fields": [
{"name": "ID", "type": "string", "constraints": {"unique": True}},
{"name": "foo", "type": "string"},
{"name": "bar", "type": "string"},
],
},
)
data = b"\r\n".join(
[
b"ID,foo,bar",
b"AA,fff,\t ",
b"BB, ,bar",
],
)
resource = frictionless.Resource(
data=data,
schema=schema,
format="csv",
encoding="utf-8",
)

report = resource.validate()
assert report.valid
with resource.open() as r:
assert list(r.row_stream) == [
{"ID": "AA", "foo": "fff", "bar": None},
{"ID": "BB", "foo": None, "bar": "bar"},
]


def test_customized_string_field_as_unique_field() -> None:
"""Tests that checking if a unique field has all distinct values is done after normalization is applied."""
# Create schema with unique string field
schema = frictionless.Schema.from_descriptor(
{
"fields": [
{"name": "ID", "type": "string", "constraints": {"unique": True}},
{"name": "foo", "type": "string"},
],
},
)
# The unique "ID" field has duplicate raw values,
# but these are converted to null by our custom class,
# and so are not included in the unique check.
data = b"\r\n".join(
[
b"ID,foo",
b"AA,22",
b" ,33",
b" ,44",
b"\t,55",
b"\t,66",
],
)
resource = frictionless.Resource(
data=data,
schema=schema,
format="csv",
encoding="utf-8",
)

report = resource.validate()

assert report.valid
with resource.open() as r:
assert list(r.row_stream) == [
{"ID": "AA", "foo": "22"},
{"ID": None, "foo": "33"},
{"ID": None, "foo": "44"},
{"ID": None, "foo": "55"},
{"ID": None, "foo": "66"},
]

0 comments on commit 2809657

Please sign in to comment.