Skip to content

Commit

Permalink
Fix 1633 unexpected key error with primary key in schema and 1635 une…
Browse files Browse the repository at this point in the history
…xpected missing label error when ignoring header case (#1641)

- fixes #1633 
- fixes #1635 

---

This PR fixes unexpected `missing-label` error in validation report
which occurs when validating tabular data in this specific case (#1635):
The data contains a named column corresponding to a required field in
the schema used for validation, written without respecting
case-sensitivity.

For example:

```
data = [["aa", "BB"], ["a", "b"]]
schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "fields": [
            {"name": "AA", "constraints": {"required": True}},
            {"name": "bb", "constraints": {"required": True}}
        ]
    }
```

It adds some test cases: 

- one test case to ensure that validating this tabular data with this
schema and using `schema_sync=True` and
`dialect=Dialect(header_case=False)` options, returns a valid report.
- one test case to ensure that if validation is case-sensitive, if a
name column does not respect case a missing-label occurs in validation
report.
- another test case to ensure that validating with missing required
column "AA" with this schema and using `schema_sync=True` and
`dialect=Dialect(header_case=False)` options, returns an invalid report
with `missing-label` error related to field "AA".

This PR also fixes unexpected KeyError raised when a primary key used in
shema is missing in the tabular data to validate(#1633).
For example:
```
data = [["B"], ["b"]]
schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "fields": [
            {"name": "A"},
            {"name": "B"}
        ],
        "primaryKey": ["A"]
    }
```
It adds some test cases: 
- two test cases to ensure that when a label related to the schema
primary key is missing, the validation report is invalid with single
missing-label error:
- a test case with the schema containing the primary key field not
specified as 'required'
- a test case with the schema containing the primary key field specified
as 'required'
- a test case to deal with insensitivity case in the data label
corresponding to the primary key schema field: the validation report is
valid

Finally, I suggest some refactoring in this PR:
- refactoring removing missing required label from field info
refactoring removing missing required label from field info
- refactoring creating schema fields among labels part when using
`schema_sync` option in `detect_schema()` `Detector` method
- refactoring to introduce intermediary `TableResource` methods to
create `cells` related to primary key schema.

---------

Co-authored-by: Pierre Camilleri <[email protected]>
  • Loading branch information
amelie-rondot and pierrecamilleri authored Aug 30, 2024
1 parent 4d334f0 commit 7861f0e
Show file tree
Hide file tree
Showing 5 changed files with 276 additions and 40 deletions.
88 changes: 76 additions & 12 deletions frictionless/detector/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ def detect_schema(
labels: Optional[List[str]] = None,
schema: Optional[Schema] = None,
field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES,
**options: Any,
) -> Schema:
"""Detect schema from fragment
Expand Down Expand Up @@ -405,20 +406,29 @@ def detect_schema(
# Sync schema
if self.schema_sync:
if labels:
case_sensitive = options["header_case"]

if not case_sensitive:
labels = [label.lower() for label in labels]

if len(labels) != len(set(labels)):
note = '"schema_sync" requires unique labels in the header'
raise FrictionlessException(note)
mapping = {field.name: field for field in schema.fields} # type: ignore
schema.clear_fields()
for name in labels:
field = mapping.get(name)
if not field:
field = Field.from_descriptor({"name": name, "type": "any"})
schema.add_field(field)
# For required fields that are missing
for _, field in mapping.items():
if field and field.required and field.name not in labels:
schema.add_field(field)

mapped_fields = self.mapped_schema_fields_names(
schema.fields, # type: ignore
case_sensitive,
)

self.rearrange_schema_fields_given_labels(
mapped_fields,
schema,
labels,
)

self.add_missing_required_labels_to_schema_fields(
mapped_fields, schema, labels, case_sensitive
)

# Patch schema
if self.schema_patch:
Expand All @@ -432,4 +442,58 @@ def detect_schema(
field_descriptor.update(field_patch)
schema = Schema.from_descriptor(descriptor)

return schema # type: ignore
return schema

@staticmethod
def mapped_schema_fields_names(
fields: List[Field], case_sensitive: bool
) -> Dict[str, Field]:
"""Create a dictionnary to map field names with schema fields"""
if case_sensitive:
return {field.name: field for field in fields}
else:
return {field.name.lower(): field for field in fields}

@staticmethod
def rearrange_schema_fields_given_labels(
fields_mapping: Dict[str, Field],
schema: Schema,
labels: List[str],
):
"""Rearrange fields according to the order of labels. All fields
missing from labels are dropped"""
schema.clear_fields()

for name in labels:
default_field = Field.from_descriptor({"name": name, "type": "any"})
field = fields_mapping.get(name, default_field)
schema.add_field(field)

def add_missing_required_labels_to_schema_fields(
self,
fields_mapping: Dict[str, Field],
schema: Schema,
labels: List[str],
case_sensitive: bool,
):
"""This method aims to add missing required labels and
primary key field not in labels to schema fields.
"""
for name, field in fields_mapping.items():
if (
self.field_is_required(field, schema, case_sensitive)
and name not in labels
):
schema.add_field(field)

@staticmethod
def field_is_required(
field: Field,
schema: Schema,
case_sensitive: bool,
) -> bool:
if case_sensitive:
return field.required or field.name in schema.primary_key
else:
lower_primary_key = [pk.lower() for pk in schema.primary_key]
return field.required or field.name.lower() in lower_primary_key
8 changes: 5 additions & 3 deletions frictionless/resource/__spec__/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ def test_resource_source_path_error_bad_path_not_safe_traversing():
Resource(
{
"name": "name",
"path": "data/../data/table.csv"
if not platform.type == "windows"
else "data\\..\\table.csv",
"path": (
"data/../data/table.csv"
if not platform.type == "windows"
else "data\\..\\table.csv"
),
}
)
error = excinfo.value.error
Expand Down
94 changes: 77 additions & 17 deletions frictionless/resources/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import warnings
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

from frictionless.schema.field import Field

from .. import errors, helpers, settings
from ..analyzer import Analyzer
from ..dialect import Dialect
Expand Down Expand Up @@ -204,6 +206,7 @@ def __open_schema(self):
labels=self.labels,
schema=self.schema,
field_candidates=system.detect_field_candidates(),
header_case=self.dialect.header_case,
)
self.stats.fields = len(self.schema.fields)

Expand Down Expand Up @@ -331,15 +334,21 @@ def row_stream():

# Primary Key Error
if is_integrity and self.schema.primary_key:
cells = tuple(row[name] for name in self.schema.primary_key)
if set(cells) == {None}:
note = 'cells composing the primary keys are all "None"'
error = errors.PrimaryKeyError.from_row(row, note=note)
row.errors.append(error)
try:
cells = self.primary_key_cells(row, self.dialect.header_case)
except KeyError:
# Row does not have primary_key as label
# There should already be a missing-label error in
# in self.header corresponding to the schema primary key
assert not self.header.valid
else:
match = memory_primary.get(cells)
memory_primary[cells] = row.row_number
if match:
if set(cells) == {None}:
note = 'cells composing the primary keys are all "None"'
error = errors.PrimaryKeyError.from_row(row, note=note)
row.errors.append(error)
else:
match = memory_primary.get(cells)
memory_primary[cells] = row.row_number
if match:
note = "the same as in the row at position %s" % match
error = errors.PrimaryKeyError.from_row(row, note=note)
Expand Down Expand Up @@ -386,20 +395,71 @@ def row_stream():
# Yield row
yield row

# NB: missing required labels are not included in the
# field_info parameter used for row creation
if self.detector.schema_sync:
# Missing required labels are not included in the
# field_info parameter used for row creation
for field in self.schema.fields:
if field.name not in self.labels and field.name in field_info["names"]:
field_index = field_info["names"].index(field.name)
del field_info["names"][field_index]
del field_info["objects"][field_index]
del field_info["mapping"][field.name]
# # Create row stream
self.remove_missing_required_label_from_field_info(field, field_info)

# Create row stream
self.__row_stream = row_stream()

# Read
def remove_missing_required_label_from_field_info(
self, field: Field, field_info: Dict[str, Any]
):
is_case_sensitive = self.dialect.header_case
if self.label_is_missing(
field.name, field_info["names"], self.labels, is_case_sensitive
):
self.remove_field_from_field_info(field.name, field_info)

@staticmethod
def label_is_missing(
field_name: str,
expected_field_names: List[str],
table_labels: types.ILabels,
case_sensitive: bool,
) -> bool:
"""Check if a schema field name is missing from the TableResource
labels.
"""
if not case_sensitive:
field_name = field_name.lower()
table_labels = [label.lower() for label in table_labels]
expected_field_names = [
field_name.lower() for field_name in expected_field_names
]

return field_name not in table_labels and field_name in expected_field_names

@staticmethod
def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]):
field_index = field_info["names"].index(field_name)
del field_info["names"][field_index]
del field_info["objects"][field_index]
del field_info["mapping"][field_name]

def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]:
"""Create a tuple containg all cells from a given row associated to primary
keys"""
return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive))

def primary_key_labels(
self,
row: Row,
case_sensitive: bool,
) -> List[str]:
"""Create a list of TableResource labels that are primary keys"""
if case_sensitive:
labels_primary_key = self.schema.primary_key
else:
lower_primary_key = [pk.lower() for pk in self.schema.primary_key]
labels_primary_key = [
label for label in row.field_names if label.lower() in lower_primary_key
]
return labels_primary_key

# Read
def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]:
"""Read lists into memory
Expand Down
44 changes: 44 additions & 0 deletions frictionless/table/__spec__/test_header.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest

import frictionless
from frictionless import Schema, fields
from frictionless.resources import TableResource

Expand Down Expand Up @@ -37,3 +40,44 @@ def test_missing_label():
assert header == ["id", "name", "extra"]
assert header.labels == ["id", "name"]
assert header.valid is False


@pytest.mark.parametrize(
"source, required, valid_report, nb_errors, types_errors_expected, header_case",
[
([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
([["B"], ["foo"]], {}, False, 1, ["missing-label"], True),
([["a"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True),
([["a"], ["foo"]], {}, False, 1, ["missing-label"], True),
# Ignore header_case
([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], False),
([["B"], ["foo"]], {}, False, 1, ["missing-label"], False),
([["a"], ["foo"]], {"required": True}, True, 0, [], False),
([["a"], ["foo"]], {}, True, 0, [], False),
],
)
def test_missing_primary_key_label_with_shema_sync_issue_1633(
source, required, valid_report, nb_errors, types_errors_expected, header_case
):
schema_descriptor = {
"$schema": "https://frictionlessdata.io/schemas/table-schema.json",
"fields": [{"name": "A", "constraints": required}],
"primaryKey": ["A"],
}

resource = TableResource(
source=source,
schema=Schema.from_descriptor(schema_descriptor),
detector=frictionless.Detector(schema_sync=True),
dialect=frictionless.Dialect(header_case=header_case),
)

report = frictionless.validate(resource)

assert report.valid == valid_report

if not report.valid:
errors = report.tasks[0].errors
assert len(errors) == nb_errors
for error, type_expected in zip(errors, types_errors_expected):
assert error.type == type_expected
Loading

0 comments on commit 7861f0e

Please sign in to comment.