Merge branch 'master' into feat/update-dataset-tab-with-ssb-components
tilen1976 committed Mar 15, 2024
2 parents 2c83672 + 154e4a0 commit e0769d8
Showing 2 changed files with 35 additions and 8 deletions.
12 changes: 8 additions & 4 deletions src/datadoc/backend/dataset_parser.py
@@ -12,14 +12,15 @@
 from abc import abstractmethod
 
 import pandas as pd
-import pyarrow.parquet as pq
 from datadoc_model.model import LanguageStringType
 from datadoc_model.model import Variable
+from pyarrow import parquet as pq
 
 from datadoc import state
 from datadoc.enums import DataType
 
 if t.TYPE_CHECKING:
+    import pyarrow as pa
     from cloudpathlib import CloudPath
 
 KNOWN_INTEGER_TYPES = (
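
A note on the import changes above: `pyarrow` is aliased as `pa` only under `if t.TYPE_CHECKING:`, which makes the name visible to static type checkers for the new `pa.Schema` annotation without adding a runtime import. A minimal sketch of the pattern, using an illustrative function that is not from this repository:

    import typing as t

    if t.TYPE_CHECKING:
        import pyarrow as pa  # resolved by type checkers only, never executed at runtime

    def first_field_name(schema: "pa.Schema") -> str:
        # The quoted annotation is never evaluated at runtime, so the guarded
        # import above is enough for the checker to resolve it.
        return schema.names[0]
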
@@ -169,13 +170,16 @@ def __init__(self, dataset: pathlib.Path | CloudPath) -> None:
     def get_fields(self) -> list[Variable]:
         """Extract the fields from this dataset."""
         with self.dataset.open(mode="rb") as f:
-            data_table = pq.read_table(f)  # type: ignore [arg-type]
+            # Type stubs for pyarrow are incorrect see https://github.com/zen-xu/pyarrow-stubs/issues/4
+            schema: pa.Schema = pq.read_schema(f)  # type: ignore # noqa: PGH003
         return [
             Variable(
-                short_name=data_field.name,
+                short_name=data_field.name.strip(),
                 data_type=self.transform_data_type(str(data_field.type)),
             )
-            for data_field in data_table.schema
+            for data_field in schema
+            if data_field.name
+            != "__index_level_0__"  # Index columns should not be documented
         ]


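Two things changed in `get_fields` above. First, `pq.read_table`, which loads every column of the dataset into memory, was replaced with `pq.read_schema`, which parses only the Parquet footer metadata, so extracting field names now costs the same regardless of how many rows the file holds. Second, `.strip()` guards against field names stored with stray surrounding whitespace. A rough sketch of the first point, with an illustrative file name:

    import pyarrow.parquet as pq

    # Materializes all row data in an Arrow table just to reach the schema.
    table = pq.read_table("large_dataset.parquet")
    print(table.schema)

    # Parses only the file footer; cost is independent of the number of rows.
    schema = pq.read_schema("large_dataset.parquet")
    print([(field.name, str(field.type)) for field in schema])
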
31 changes: 27 additions & 4 deletions tests/backend/test_dataset_parser.py
@@ -1,7 +1,9 @@
 """Tests for the DatasetParser class."""
 
+import io
 import pathlib
 
+import pandas as pd
 import pytest
 from datadoc_model.model import LanguageStringType
 from datadoc_model.model import Variable
@@ -82,10 +84,7 @@ def test_dataset_parser_unsupported_files(file: pathlib.Path):
 
 
 def test_transform_datatype_unknown_type():
-    expected = None
-    input_data = "definitely not a known data type"
-    actual = DatasetParser.transform_data_type(input_data)
-    assert actual == expected
+    assert DatasetParser.transform_data_type("definitely not a known data type") is None
 
 
 @pytest.mark.parametrize(
@@ -101,3 +100,27 @@ def test_transform_datatype_unknown_type():
 def test_transform_datatype(expected: DataType, concrete_type: str):
     actual = DatasetParser.transform_data_type(concrete_type)
     assert actual == expected
+
+
+@pytest.fixture()
+def parquet_with_index_column(tmp_path):
+    """Create a parquet file with a column called __index_level_0__."""
+    test_data = pd.read_csv(
+        io.StringIO(
+            """a	b
+1	4
+2	5
+3	6
+""",
+        ),
+        sep="\t",
+    )
+
+    output_path = tmp_path / "test_with_index.parquet"
+    test_data.query("b % 2 == 0").to_parquet(output_path, engine="pyarrow")
+    return output_path
+
+
+def test_parquet_with_index_column(parquet_with_index_column: pathlib.Path):
+    fields = DatasetParser.for_file(parquet_with_index_column).get_fields()
+    assert not any(f.short_name == "__index_level_0__" for f in fields)
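
The fixture works because of a pandas serialization detail: `.query("b % 2 == 0")` drops the middle row, leaving the non-default index [0, 2], and `to_parquet` writes a non-default index out as a column named `__index_level_0__` (a default RangeIndex is stored as metadata only). A standalone sketch of that behaviour, with an illustrative output path:

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # Filtering keeps rows 0 and 2, so the index is no longer a RangeIndex
    # and pandas serializes it as "__index_level_0__".
    df.query("b % 2 == 0").to_parquet("filtered.parquet", engine="pyarrow")

    print(pq.read_schema("filtered.parquet").names)
    # expected: ['a', 'b', '__index_level_0__']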
