-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #349 from gaiaresources/BDRSPS-1068
BDRSPS-1068 Add custom string field to treat whitespace-only cells as empty
- Loading branch information
Showing
4 changed files
with
240 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
"""Provides the customised string frictionless plugin and field.""" | ||
|
||
# Third-Party | ||
import attrs | ||
import frictionless | ||
import frictionless.fields | ||
|
||
# Typing | ||
from typing import Any | ||
|
||
|
||
class CustomizedStringPlugin(frictionless.Plugin): | ||
"""Customized String Plugin.""" | ||
|
||
def select_field_class( | ||
self, | ||
type: str | None = None, # noqa: A002 | ||
) -> type[frictionless.Field] | None: | ||
"""This hook allows a plugin to override the class used for a field, depending on the field's type. | ||
Args: | ||
type: The type of the field. | ||
Returns: | ||
Return the class to use for the field, or None. | ||
""" | ||
# Override the class for string fields with our custom class. | ||
if type == "string": | ||
return CustomizedStringField | ||
# Not a string field, don't override the class. | ||
# Frictionless will fall back to other plugins or the built-in field classes. | ||
return None | ||
|
||
|
||
@attrs.define(kw_only=True, repr=False) | ||
class CustomizedStringField(frictionless.fields.StringField): | ||
"""Custom String Field. | ||
Compared to the normal string field, this class | ||
1. Converts any whitespace-only cell to the empty string before validating and reading it. | ||
This is so whitespace-only cells are treated the same as empty cells. | ||
""" | ||
|
||
# Class attributes | ||
type = "string" | ||
builtin = False | ||
|
||
# Read | ||
|
||
def create_cell_reader(self) -> frictionless.schema.ICellReader: | ||
"""Override the way that cells are read. | ||
NOTE we override create_cell_reader() rather than create_value_reader(), so that | ||
1. An all-whitespace cell is converted to an emtpy string BEFORE frictionless | ||
checks if the cell is an "empty value". | ||
This means an all-whitespace cell will be None when our code reads the | ||
value when mapping, the same as an empty cell. | ||
2. This also means that an all-whitespace cell will fail validation when the | ||
field is required. | ||
Returns: | ||
A function to read cell contents. | ||
""" | ||
# get the cell_reader for the parent class (StringField) | ||
default_cell_reader = super().create_cell_reader() | ||
|
||
# define our custom cell reader | ||
def cell_reader(cell: Any) -> tuple[Any, frictionless.schema.INotes]: | ||
# first convert whitespace-only cell to empty string | ||
if isinstance(cell, str) and cell.isspace(): | ||
cell = "" | ||
|
||
# then let frictionless do its normal conversions/validations | ||
return default_cell_reader(cell) | ||
|
||
return cell_reader | ||
|
||
|
||
# Register Plugin | ||
frictionless.system.register( | ||
name="customized-string", | ||
plugin=CustomizedStringPlugin(), | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import frictionless | ||
|
||
from abis_mapping import plugins | ||
|
||
|
||
def test_customized_string_plugin() -> None: | ||
"""Tests the customized string plugin""" | ||
# Instantiate plugin | ||
plugin = plugins.string_customized.CustomizedStringPlugin() | ||
|
||
# Incorrect type | ||
result = plugin.select_field_class("notAType") | ||
assert result is None | ||
|
||
# Type this plugin overrides field class for | ||
result = plugin.select_field_class("string") | ||
assert result is plugins.string_customized.CustomizedStringField | ||
|
||
|
||
def test_customized_string_registered() -> None: | ||
"""Tests the customized string class is used for string fields.""" | ||
# Create schema with string field | ||
schema = frictionless.Schema.from_descriptor({"fields": [{"name": "foo", "type": "string"}]}) | ||
|
||
# Extract string field | ||
field = schema.get_field("foo") | ||
|
||
# field is our custom class | ||
assert isinstance(field, plugins.string_customized.CustomizedStringField) | ||
|
||
|
||
def test_customized_string_field() -> None: | ||
"""Tests the customized string field.""" | ||
# Instantiate the field | ||
field = plugins.string_customized.CustomizedStringField(name="TestField") | ||
|
||
# Normal string field behavior | ||
assert field.read_cell("") == (None, None) | ||
assert field.read_cell("foo") == ("foo", None) | ||
assert field.read_cell(" foo") == (" foo", None) | ||
assert field.read_cell("foo ") == ("foo ", None) | ||
|
||
# All whitespace value is customized to None | ||
assert field.read_cell(" ") == (None, None) | ||
assert field.read_cell(" ") == (None, None) | ||
assert field.read_cell("\n ") == (None, None) | ||
assert field.read_cell(" \r\n") == (None, None) | ||
assert field.read_cell(" \t ") == (None, None) | ||
|
||
|
||
def test_customized_string_field_with_required_constraint() -> None: | ||
"""Tests the customized string field with a required constraint.""" | ||
# Instantiate the field | ||
required_field = plugins.string_customized.CustomizedStringField( | ||
name="TestField", | ||
constraints={"required": True}, | ||
) | ||
|
||
# normal string field behavior | ||
assert required_field.read_cell("foo") == ("foo", None) | ||
assert required_field.read_cell("") == (None, {"required": 'constraint "required" is "True"'}) | ||
|
||
# Normalization is applied before checking constraint, making these invalid. | ||
assert required_field.read_cell(" ") == (None, {"required": 'constraint "required" is "True"'}) | ||
assert required_field.read_cell("\t") == (None, {"required": 'constraint "required" is "True"'}) | ||
|
||
|
||
def test_using_customized_string_field_with_resource() -> None: | ||
"""Test reading a valid csv where the cells are string fields.""" | ||
# Create schema with some string fields | ||
schema = frictionless.Schema.from_descriptor( | ||
{ | ||
"fields": [ | ||
{"name": "ID", "type": "string", "constraints": {"unique": True}}, | ||
{"name": "foo", "type": "string"}, | ||
{"name": "bar", "type": "string"}, | ||
], | ||
}, | ||
) | ||
data = b"\r\n".join( | ||
[ | ||
b"ID,foo,bar", | ||
b"AA,fff,\t ", | ||
b"BB, ,bar", | ||
], | ||
) | ||
resource = frictionless.Resource( | ||
data=data, | ||
schema=schema, | ||
format="csv", | ||
encoding="utf-8", | ||
) | ||
|
||
report = resource.validate() | ||
assert report.valid | ||
with resource.open() as r: | ||
assert list(r.row_stream) == [ | ||
{"ID": "AA", "foo": "fff", "bar": None}, | ||
{"ID": "BB", "foo": None, "bar": "bar"}, | ||
] | ||
|
||
|
||
def test_customized_string_field_as_unique_field() -> None: | ||
"""Tests that checking if a unique field has all distinct values is done after normalization is applied.""" | ||
# Create schema with unique string field | ||
schema = frictionless.Schema.from_descriptor( | ||
{ | ||
"fields": [ | ||
{"name": "ID", "type": "string", "constraints": {"unique": True}}, | ||
{"name": "foo", "type": "string"}, | ||
], | ||
}, | ||
) | ||
# The unique "ID" field has duplicate raw values, | ||
# but these are converted to null by our custom class, | ||
# and so are not included in the unique check. | ||
data = b"\r\n".join( | ||
[ | ||
b"ID,foo", | ||
b"AA,22", | ||
b" ,33", | ||
b" ,44", | ||
b"\t,55", | ||
b"\t,66", | ||
], | ||
) | ||
resource = frictionless.Resource( | ||
data=data, | ||
schema=schema, | ||
format="csv", | ||
encoding="utf-8", | ||
) | ||
|
||
report = resource.validate() | ||
|
||
assert report.valid | ||
with resource.open() as r: | ||
assert list(r.row_stream) == [ | ||
{"ID": "AA", "foo": "22"}, | ||
{"ID": None, "foo": "33"}, | ||
{"ID": None, "foo": "44"}, | ||
{"ID": None, "foo": "55"}, | ||
{"ID": None, "foo": "66"}, | ||
] |