Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BDRSPS-1068 Add custom string field to treat whitespace-only cells as empty #349

Merged
merged 2 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,15 @@ python docs/tables/vocabs.py <template_id> [-o] <output_file>
where `output_file` (optional) corresponds to a location to store the resulting csv.
Default output is standard out
if no `output_file` argument provided.

## Frictionless

The project uses the [Frictionless Framework](https://framework.frictionlessdata.io/) to validate and read the
template CSV files.
We have a number of plugins defined in the [`/abis_mapping/plugins`](/abis_mapping/plugins) directory which
customize and extend how Frictionless works.

### Overridden String Field Class

In particular, we have the [`abis_mapping/plugins/string_customized.py`](/abis_mapping/plugins/string_customized.py)
plugin which overrides the default Frictionless string field to use our custom class.
Comment on lines +96 to +106
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

1 change: 1 addition & 0 deletions abis_mapping/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from . import mutual_inclusion
from . import required
from . import sites_geometry
from . import string_customized
from . import tabular
from . import timestamp
from . import wkt
83 changes: 83 additions & 0 deletions abis_mapping/plugins/string_customized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Provides the customised string frictionless plugin and field."""

# Third-Party
import attrs
import frictionless
import frictionless.fields

# Typing
from typing import Any


class CustomizedStringPlugin(frictionless.Plugin):
"""Customized String Plugin."""

def select_field_class(
self,
type: str | None = None, # noqa: A002
) -> type[frictionless.Field] | None:
"""This hook allows a plugin to override the class used for a field, depending on the field's type.

Args:
type: The type of the field.

Returns:
Return the class to use for the field, or None.
"""
# Override the class for string fields with our custom class.
if type == "string":
return CustomizedStringField
# Not a string field, don't override the class.
# Frictionless will fall back to other plugins or the built-in field classes.
return None


@attrs.define(kw_only=True, repr=False)
class CustomizedStringField(frictionless.fields.StringField):
"""Custom String Field.

Compared to the normal string field, this class
1. Converts any whitespace-only cell to the empty string before validating and reading it.
This is so whitespace-only cells are treated the same as empty cells.
"""

# Class attributes
type = "string"
builtin = False

# Read

def create_cell_reader(self) -> frictionless.schema.ICellReader:
"""Override the way that cells are read.

NOTE we override create_cell_reader() rather than create_value_reader(), so that
1. An all-whitespace cell is converted to an emtpy string BEFORE frictionless
checks if the cell is an "empty value".
This means an all-whitespace cell will be None when our code reads the
value when mapping, the same as an empty cell.
2. This also means that an all-whitespace cell will fail validation when the
field is required.

Returns:
A function to read cell contents.
"""
# get the cell_reader for the parent class (StringField)
default_cell_reader = super().create_cell_reader()

# define our custom cell reader
def cell_reader(cell: Any) -> tuple[Any, frictionless.schema.INotes]:
# first convert whitespace-only cell to empty string
if isinstance(cell, str) and cell.isspace():
cell = ""

# then let frictionless do its normal conversions/validations
return default_cell_reader(cell)

return cell_reader


# Register Plugin
frictionless.system.register(
name="customized-string",
plugin=CustomizedStringPlugin(),
)
144 changes: 144 additions & 0 deletions tests/plugins/test_customized_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import frictionless

from abis_mapping import plugins


def test_customized_string_plugin() -> None:
"""Tests the customized string plugin"""
# Instantiate plugin
plugin = plugins.string_customized.CustomizedStringPlugin()

# Incorrect type
result = plugin.select_field_class("notAType")
assert result is None

# Type this plugin overrides field class for
result = plugin.select_field_class("string")
assert result is plugins.string_customized.CustomizedStringField


def test_customized_string_registered() -> None:
"""Tests the customized string class is used for string fields."""
# Create schema with string field
schema = frictionless.Schema.from_descriptor({"fields": [{"name": "foo", "type": "string"}]})

# Extract string field
field = schema.get_field("foo")

# field is our custom class
assert isinstance(field, plugins.string_customized.CustomizedStringField)


def test_customized_string_field() -> None:
"""Tests the customized string field."""
# Instantiate the field
field = plugins.string_customized.CustomizedStringField(name="TestField")

# Normal string field behavior
assert field.read_cell("") == (None, None)
assert field.read_cell("foo") == ("foo", None)
assert field.read_cell(" foo") == (" foo", None)
assert field.read_cell("foo ") == ("foo ", None)

# All whitespace value is customized to None
assert field.read_cell(" ") == (None, None)
assert field.read_cell(" ") == (None, None)
assert field.read_cell("\n ") == (None, None)
assert field.read_cell(" \r\n") == (None, None)
assert field.read_cell(" \t ") == (None, None)


def test_customized_string_field_with_required_constraint() -> None:
"""Tests the customized string field with a required constraint."""
# Instantiate the field
required_field = plugins.string_customized.CustomizedStringField(
name="TestField",
constraints={"required": True},
)

# normal string field behavior
assert required_field.read_cell("foo") == ("foo", None)
assert required_field.read_cell("") == (None, {"required": 'constraint "required" is "True"'})

# Normalization is applied before checking constraint, making these invalid.
assert required_field.read_cell(" ") == (None, {"required": 'constraint "required" is "True"'})
assert required_field.read_cell("\t") == (None, {"required": 'constraint "required" is "True"'})


def test_using_customized_string_field_with_resource() -> None:
"""Test reading a valid csv where the cells are string fields."""
# Create schema with some string fields
schema = frictionless.Schema.from_descriptor(
{
"fields": [
{"name": "ID", "type": "string", "constraints": {"unique": True}},
{"name": "foo", "type": "string"},
{"name": "bar", "type": "string"},
],
},
)
data = b"\r\n".join(
[
b"ID,foo,bar",
b"AA,fff,\t ",
b"BB, ,bar",
],
)
resource = frictionless.Resource(
data=data,
schema=schema,
format="csv",
encoding="utf-8",
)

report = resource.validate()
assert report.valid
with resource.open() as r:
assert list(r.row_stream) == [
{"ID": "AA", "foo": "fff", "bar": None},
{"ID": "BB", "foo": None, "bar": "bar"},
]


def test_customized_string_field_as_unique_field() -> None:
"""Tests that checking if a unique field has all distinct values is done after normalization is applied."""
# Create schema with unique string field
schema = frictionless.Schema.from_descriptor(
{
"fields": [
{"name": "ID", "type": "string", "constraints": {"unique": True}},
{"name": "foo", "type": "string"},
],
},
)
# The unique "ID" field has duplicate raw values,
# but these are converted to null by our custom class,
# and so are not included in the unique check.
data = b"\r\n".join(
[
b"ID,foo",
b"AA,22",
b" ,33",
b" ,44",
b"\t,55",
b"\t,66",
],
)
resource = frictionless.Resource(
data=data,
schema=schema,
format="csv",
encoding="utf-8",
)

report = resource.validate()

assert report.valid
with resource.open() as r:
assert list(r.row_stream) == [
{"ID": "AA", "foo": "22"},
{"ID": None, "foo": "33"},
{"ID": None, "foo": "44"},
{"ID": None, "foo": "55"},
{"ID": None, "foo": "66"},
]