From a0dd00c17b9170476ab794d9072bac284e9cd64a Mon Sep 17 00:00:00 2001 From: Lincoln Puzey Date: Thu, 28 Nov 2024 11:29:28 +0800 Subject: [PATCH 1/2] BDRSPS-1068 Add custom string field to treat whitespace-only cells as empty --- abis_mapping/plugins/__init__.py | 1 + abis_mapping/plugins/string_customized.py | 83 +++++++++++++ tests/plugins/test_customized_string.py | 144 ++++++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 abis_mapping/plugins/string_customized.py create mode 100644 tests/plugins/test_customized_string.py diff --git a/abis_mapping/plugins/__init__.py b/abis_mapping/plugins/__init__.py index f5bc8466..d03d6103 100644 --- a/abis_mapping/plugins/__init__.py +++ b/abis_mapping/plugins/__init__.py @@ -13,6 +13,7 @@ from . import mutual_inclusion from . import required from . import sites_geometry +from . import string_customized from . import tabular from . import timestamp from . import wkt diff --git a/abis_mapping/plugins/string_customized.py b/abis_mapping/plugins/string_customized.py new file mode 100644 index 00000000..32a397d3 --- /dev/null +++ b/abis_mapping/plugins/string_customized.py @@ -0,0 +1,83 @@ +"""Provides the customised string frictionless plugin and field.""" + +# Third-Party +import attrs +import frictionless +import frictionless.fields + +# Typing +from typing import Any + + +class CustomizedStringPlugin(frictionless.Plugin): + """Customized String Plugin.""" + + def select_field_class( + self, + type: str | None = None, # noqa: A002 + ) -> type[frictionless.Field] | None: + """This hook allows a plugin to override the class used for a field, depending on the field's type. + + Args: + type: The type of the field. + + Returns: + Return the class to use for the field, or None. + """ + # Override the class for string fields with our custom class. + if type == "string": + return CustomizedStringField + # Not a string field, don't override the class. + # Frictionless will fall back to other plugins or the built-in field classes. + return None + + +@attrs.define(kw_only=True, repr=False) +class CustomizedStringField(frictionless.fields.StringField): + """Custom String Field. + + Compared to the normal string field, this class + 1. Converts any whitespace-only cell to the empty string before validating and reading it. + This is so whitespace-only cells are treated the same as empty cells. + """ + + # Class attributes + type = "string" + builtin = False + + # Read + + def create_cell_reader(self) -> frictionless.schema.ICellReader: + """Override the way that cells are read. + + NOTE we override create_cell_reader() rather than create_value_reader(), so that + 1. An all-whitespace cell is converted to an emtpy string BEFORE frictionless + checks if the cell is an "empty value". + This means an all-whitespace cell will be None when our code reads the + value when mapping, the same as an empty cell. + 2. This also means that an all-whitespace cell will fail validation when the + field is required. + + Returns: + A function to read cell contents. + """ + # get the cell_reader for the parent class (StringField) + default_cell_reader = super().create_cell_reader() + + # define our custom cell reader + def cell_reader(cell: Any) -> tuple[Any, frictionless.schema.INotes]: + # first convert whitespace-only cell to empty string + if isinstance(cell, str) and cell.isspace(): + cell = "" + + # then let frictionless do its normal conversions/validations + return default_cell_reader(cell) + + return cell_reader + + +# Register Plugin +frictionless.system.register( + name="customized-string", + plugin=CustomizedStringPlugin(), +) diff --git a/tests/plugins/test_customized_string.py b/tests/plugins/test_customized_string.py new file mode 100644 index 00000000..03eddbd6 --- /dev/null +++ b/tests/plugins/test_customized_string.py @@ -0,0 +1,144 @@ +import frictionless + +from abis_mapping import plugins + + +def test_customized_string_plugin() -> None: + """Tests the customized string plugin""" + # Instantiate plugin + plugin = plugins.string_customized.CustomizedStringPlugin() + + # Incorrect type + result = plugin.select_field_class("notAType") + assert result is None + + # Type this plugin overrides field class for + result = plugin.select_field_class("string") + assert result is plugins.string_customized.CustomizedStringField + + +def test_customized_string_registered() -> None: + """Tests the customized string class is used for string fields.""" + # Create schema with string field + schema = frictionless.Schema.from_descriptor({"fields": [{"name": "foo", "type": "string"}]}) + + # Extract string field + field = schema.get_field("foo") + + # field is our custom class + assert isinstance(field, plugins.string_customized.CustomizedStringField) + + +def test_customized_string_field() -> None: + """Tests the customized string field.""" + # Instantiate the field + field = plugins.string_customized.CustomizedStringField(name="TestField") + + # Normal string field behavior + assert field.read_cell("") == (None, None) + assert field.read_cell("foo") == ("foo", None) + assert field.read_cell(" foo") == (" foo", None) + assert field.read_cell("foo ") == ("foo ", None) + + # All whitespace value is customized to None + assert field.read_cell(" ") == (None, None) + assert field.read_cell(" ") == (None, None) + assert field.read_cell("\n ") == (None, None) + assert field.read_cell(" \r\n") == (None, None) + assert field.read_cell(" \t ") == (None, None) + + +def test_customized_string_field_with_required_constraint() -> None: + """Tests the customized string field with a required constraint.""" + # Instantiate the field + required_field = plugins.string_customized.CustomizedStringField( + name="TestField", + constraints={"required": True}, + ) + + # normal string field behavior + assert required_field.read_cell("foo") == ("foo", None) + assert required_field.read_cell("") == (None, {"required": 'constraint "required" is "True"'}) + + # Normalization is applied before checking constraint, making these invalid. + assert required_field.read_cell(" ") == (None, {"required": 'constraint "required" is "True"'}) + assert required_field.read_cell("\t") == (None, {"required": 'constraint "required" is "True"'}) + + +def test_using_customized_string_field_with_resource() -> None: + """Test reading a valid csv where the cells are string fields.""" + # Create schema with some string fields + schema = frictionless.Schema.from_descriptor( + { + "fields": [ + {"name": "ID", "type": "string", "constraints": {"unique": True}}, + {"name": "foo", "type": "string"}, + {"name": "bar", "type": "string"}, + ], + }, + ) + data = b"\r\n".join( + [ + b"ID,foo,bar", + b"AA,fff,\t ", + b"BB, ,bar", + ], + ) + resource = frictionless.Resource( + data=data, + schema=schema, + format="csv", + encoding="utf-8", + ) + + report = resource.validate() + assert report.valid + with resource.open() as r: + assert list(r.row_stream) == [ + {"ID": "AA", "foo": "fff", "bar": None}, + {"ID": "BB", "foo": None, "bar": "bar"}, + ] + + +def test_customized_string_field_as_unique_field() -> None: + """Tests that checking if a unique field has all distinct values is done after normalization is applied.""" + # Create schema with unique string field + schema = frictionless.Schema.from_descriptor( + { + "fields": [ + {"name": "ID", "type": "string", "constraints": {"unique": True}}, + {"name": "foo", "type": "string"}, + ], + }, + ) + # The unique "ID" field has duplicate raw values, + # but these are converted to null by our custom class, + # and so are not included in the unique check. + data = b"\r\n".join( + [ + b"ID,foo", + b"AA,22", + b" ,33", + b" ,44", + b"\t,55", + b"\t,66", + ], + ) + resource = frictionless.Resource( + data=data, + schema=schema, + format="csv", + encoding="utf-8", + ) + + report = resource.validate() + + assert report.valid + with resource.open() as r: + assert list(r.row_stream) == [ + {"ID": "AA", "foo": "22"}, + {"ID": None, "foo": "33"}, + {"ID": None, "foo": "44"}, + {"ID": None, "foo": "55"}, + {"ID": None, "foo": "66"}, + ] From 0c80286fe3cfb60f8c3d5216b51b9ecc443b0781 Mon Sep 17 00:00:00 2001 From: Lincoln Puzey Date: Thu, 28 Nov 2024 12:45:27 +0800 Subject: [PATCH 2/2] BDRSPS-1068 Update README.md with note about frictionless and plugins --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index b378fe3f..bce0598b 100644 --- a/README.md +++ b/README.md @@ -92,3 +92,15 @@ python docs/tables/vocabs.py [-o] where `output_file` (optional) corresponds to a location to store the resulting csv. Default output is standard out if no `output_file` argument provided. + +## Frictionless + +The project uses the [Frictionless Framework](https://framework.frictionlessdata.io/) to validate and read the +template CSV files. +We have a number of plugins defined in the [`/abis_mapping/plugins`](/abis_mapping/plugins) directory which +customize and extend how Frictionless works. + +### Overridden String Field Class + +In particular, we have the [`abis_mapping/plugins/string_customized.py`](/abis_mapping/plugins/string_customized.py) +plugin which overrides the default Frictionless string field to use our custom class.