-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BDRSPS-1068 Add custom string field to treat whitespace-only cells as empty #349
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
"""Provides the customised string frictionless plugin and field.""" | ||
|
||
# Third-Party | ||
import attrs | ||
import frictionless | ||
import frictionless.fields | ||
|
||
# Typing | ||
from typing import Any | ||
|
||
|
||
class CustomizedStringPlugin(frictionless.Plugin):
    """Frictionless plugin that substitutes the field class used for string fields."""

    def select_field_class(
        self,
        type: str | None = None,  # noqa: A002
    ) -> type[frictionless.Field] | None:
        """Pick the field class to use for a given field type.

        Args:
            type: The type of the field.

        Returns:
            ``CustomizedStringField`` when ``type`` is ``"string"``; otherwise
            ``None``, which tells frictionless not to override the class so it
            can fall back to other plugins or the built-in field classes.
        """
        # Only string fields are overridden; every other type is left alone.
        return CustomizedStringField if type == "string" else None
|
||
|
||
@attrs.define(kw_only=True, repr=False)
class CustomizedStringField(frictionless.fields.StringField):
    """String field that treats whitespace-only cells as empty cells.

    The only difference from the normal string field is that any cell made up
    solely of whitespace is replaced by the empty string before it is
    validated and read.
    """

    # Class attributes
    type = "string"
    builtin = False

    # Read

    def create_cell_reader(self) -> frictionless.schema.ICellReader:
        """Build the cell reader, normalizing whitespace-only cells first.

        NOTE create_cell_reader() is overridden rather than create_value_reader(), so that
            1. An all-whitespace cell becomes an empty string BEFORE frictionless
                checks whether the cell is an "empty value".
                An all-whitespace cell is therefore read as None when mapping,
                exactly like an empty cell.
            2. An all-whitespace cell also fails validation when the field is
                required.

        Returns:
            A function to read cell contents.
        """
        # The reader that the parent class (StringField) would have used.
        parent_reader = super().create_cell_reader()

        def cell_reader(cell: Any) -> tuple[Any, frictionless.schema.INotes]:
            # Swap a whitespace-only string for "", leave anything else untouched,
            # then hand over to the normal frictionless conversions/validations.
            normalized = "" if isinstance(cell, str) and cell.isspace() else cell
            return parent_reader(normalized)

        return cell_reader
|
||
|
||
# Register the plugin with frictionless at import time, under the name "customized-string".
frictionless.system.register(name="customized-string", plugin=CustomizedStringPlugin())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import frictionless | ||
|
||
from abis_mapping import plugins | ||
|
||
|
||
def test_customized_string_plugin() -> None:
    """Checks which field class the plugin selects for a given field type."""
    plugin = plugins.string_customized.CustomizedStringPlugin()

    # A type the plugin does not handle is not overridden.
    assert plugin.select_field_class("notAType") is None

    # The "string" type is overridden with the customized field class.
    assert plugin.select_field_class("string") is plugins.string_customized.CustomizedStringField
|
||
|
||
def test_customized_string_registered() -> None:
    """Checks that string fields in a schema are built with the customized class."""
    # A minimal schema containing a single string field.
    descriptor = {"fields": [{"name": "foo", "type": "string"}]}
    schema = frictionless.Schema.from_descriptor(descriptor)

    # The field frictionless created must be an instance of our custom class.
    assert isinstance(schema.get_field("foo"), plugins.string_customized.CustomizedStringField)
|
||
|
||
def test_customized_string_field() -> None:
    """Checks how the customized string field reads individual cells."""
    field = plugins.string_customized.CustomizedStringField(name="TestField")

    # Normal string field behavior: an empty cell reads as None ...
    assert field.read_cell("") == (None, None)
    # ... and text with content is returned unchanged, surrounding spaces included.
    for text in ("foo", " foo", "foo "):
        assert field.read_cell(text) == (text, None)

    # Customized behavior: every whitespace-only cell reads as None.
    for blank in (" ", " ", "\n ", " \r\n", " \t "):
        assert field.read_cell(blank) == (None, None)
|
||
|
||
def test_customized_string_field_with_required_constraint() -> None:
    """Checks the customized string field when the "required" constraint is set."""
    required_field = plugins.string_customized.CustomizedStringField(
        name="TestField",
        constraints={"required": True},
    )
    # The notes reported whenever the required constraint is violated.
    missing_notes = {"required": 'constraint "required" is "True"'}

    # Normal string field behavior.
    assert required_field.read_cell("foo") == ("foo", None)
    assert required_field.read_cell("") == (None, missing_notes)

    # Whitespace-only cells are normalized before the constraint is checked,
    # so they are rejected in the same way as an empty cell.
    for blank in (" ", "\t"):
        assert required_field.read_cell(blank) == (None, missing_notes)
|
||
|
||
def test_using_customized_string_field_with_resource() -> None:
    """Reads a valid csv whose columns are all string fields, via a resource."""
    # Schema with three string fields, the first of which must be unique.
    descriptor = {
        "fields": [
            {"name": "ID", "type": "string", "constraints": {"unique": True}},
            {"name": "foo", "type": "string"},
            {"name": "bar", "type": "string"},
        ],
    }
    schema = frictionless.Schema.from_descriptor(descriptor)

    # Two data rows, each containing one whitespace-only cell.
    csv_lines = (
        b"ID,foo,bar",
        b"AA,fff,\t ",
        b"BB, ,bar",
    )
    resource = frictionless.Resource(
        data=b"\r\n".join(csv_lines),
        schema=schema,
        format="csv",
        encoding="utf-8",
    )

    # The data validates, and the whitespace-only cells come back as None.
    assert resource.validate().valid
    expected_rows = [
        {"ID": "AA", "foo": "fff", "bar": None},
        {"ID": "BB", "foo": None, "bar": "bar"},
    ]
    with resource.open() as r:
        assert list(r.row_stream) == expected_rows
|
||
|
||
def test_customized_string_field_as_unique_field() -> None:
    """Checks that the uniqueness check on a field runs after normalization is applied."""
    # Schema whose "ID" string field must be unique.
    descriptor = {
        "fields": [
            {"name": "ID", "type": "string", "constraints": {"unique": True}},
            {"name": "foo", "type": "string"},
        ],
    }
    schema = frictionless.Schema.from_descriptor(descriptor)

    # The raw "ID" values repeat (whitespace-only cells), but the custom class
    # converts them to null, so they take no part in the unique check.
    csv_lines = (
        b"ID,foo",
        b"AA,22",
        b" ,33",
        b" ,44",
        b"\t,55",
        b"\t,66",
    )
    resource = frictionless.Resource(
        data=b"\r\n".join(csv_lines),
        schema=schema,
        format="csv",
        encoding="utf-8",
    )

    assert resource.validate().valid

    expected_rows = [
        {"ID": "AA", "foo": "22"},
        {"ID": None, "foo": "33"},
        {"ID": None, "foo": "44"},
        {"ID": None, "foo": "55"},
        {"ID": None, "foo": "66"},
    ]
    with resource.open() as r:
        assert list(r.row_stream) == expected_rows
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍