From 154e4a049a6509bfafa191627ca9f2452dc2728b Mon Sep 17 00:00:00 2001 From: Miles Mason Winther <42948872+mmwinther@users.noreply.github.com> Date: Thu, 14 Mar 2024 16:04:08 +0100 Subject: [PATCH] Filter out columns called __index_level_0__ (#227) --- src/datadoc/backend/dataset_parser.py | 12 +++++++---- tests/backend/test_dataset_parser.py | 31 +++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/datadoc/backend/dataset_parser.py b/src/datadoc/backend/dataset_parser.py index b8cf382b..bdf16678 100644 --- a/src/datadoc/backend/dataset_parser.py +++ b/src/datadoc/backend/dataset_parser.py @@ -12,14 +12,15 @@ from abc import abstractmethod import pandas as pd -import pyarrow.parquet as pq from datadoc_model.model import LanguageStringType from datadoc_model.model import Variable +from pyarrow import parquet as pq from datadoc import state from datadoc.enums import DataType if t.TYPE_CHECKING: + import pyarrow as pa from cloudpathlib import CloudPath KNOWN_INTEGER_TYPES = ( @@ -169,13 +170,16 @@ def __init__(self, dataset: pathlib.Path | CloudPath) -> None: def get_fields(self) -> list[Variable]: """Extract the fields from this dataset.""" with self.dataset.open(mode="rb") as f: - data_table = pq.read_table(f) # type: ignore [arg-type] + # Type stubs for pyarrow are incorrect see https://github.com/zen-xu/pyarrow-stubs/issues/4 + schema: pa.Schema = pq.read_schema(f) # type: ignore # noqa: PGH003 return [ Variable( - short_name=data_field.name, + short_name=data_field.name.strip(), data_type=self.transform_data_type(str(data_field.type)), ) - for data_field in data_table.schema + for data_field in schema + if data_field.name + != "__index_level_0__" # Index columns should not be documented ] diff --git a/tests/backend/test_dataset_parser.py b/tests/backend/test_dataset_parser.py index 4c325696..f80a0b38 100644 --- a/tests/backend/test_dataset_parser.py +++ b/tests/backend/test_dataset_parser.py @@ -1,7 +1,9 @@ """Tests for the DatasetParser class.""" +import io import pathlib +import pandas as pd import pytest from datadoc_model.model import LanguageStringType from datadoc_model.model import Variable @@ -82,10 +84,7 @@ def test_dataset_parser_unsupported_files(file: pathlib.Path): def test_transform_datatype_unknown_type(): - expected = None - input_data = "definitely not a known data type" - actual = DatasetParser.transform_data_type(input_data) - assert actual == expected + assert DatasetParser.transform_data_type("definitely not a known data type") is None @pytest.mark.parametrize( @@ -101,3 +100,27 @@ def test_transform_datatype_unknown_type(): def test_transform_datatype(expected: DataType, concrete_type: str): actual = DatasetParser.transform_data_type(concrete_type) assert actual == expected + + +@pytest.fixture() +def parquet_with_index_column(tmp_path): + """Create a parquet file with a column called __index_level_0__.""" + test_data = pd.read_csv( + io.StringIO( + """a b +1 4 +2 5 +3 6 +""", + ), + sep="\t", + ) + + output_path = tmp_path / "test_with_index.parquet" + test_data.query("b % 2 == 0").to_parquet(output_path, engine="pyarrow") + return output_path + + +def test_parquet_with_index_column(parquet_with_index_column: pathlib.Path): + fields = DatasetParser.for_file(parquet_with_index_column).get_fields() + assert not any(f.short_name == "__index_level_0__" for f in fields)