Skip to content

Commit

Permalink
preview-csv-dataset (#129)
Browse files Browse the repository at this point in the history
* preview func for csv dataset

Signed-off-by: huongg <[email protected]>

* preview for excel and parquet dataset

Signed-off-by: huongg <[email protected]>

* tests for _preview

Signed-off-by: huongg <[email protected]>

* update var name in the loop

Signed-off-by: huongg <[email protected]>

* create a copy of data and re-use _load

Signed-off-by: huongg <[email protected]>

* remove parquet dataset preview

Signed-off-by: huongg <[email protected]>

* default value for nrows

Co-authored-by: Antony Milne <[email protected]>

* set default val for excel nrows

Signed-off-by: huongg <[email protected]>

* use orient='tight' in to_dict()

Signed-off-by: huongg <[email protected]>

* use split instead of tight as current pandas does not support it

Signed-off-by: huongg <[email protected]>

* add preview tests for excel and csv

Signed-off-by: huongg <[email protected]>

* formatting

Signed-off-by: huongg <[email protected]>

* remove unused urllib

Signed-off-by: huongg <[email protected]>

* pylint: disable=protected-access

Signed-off-by: huongg <[email protected]>

* ignore import error

Signed-off-by: huongg <[email protected]>

* formatting

Signed-off-by: huongg <[email protected]>

* remove ignore import error as it happens locally only

Signed-off-by: huongg <[email protected]>

* fix lint error

Signed-off-by: huongg <[email protected]>

---------

Signed-off-by: huongg <[email protected]>
Co-authored-by: Antony Milne <[email protected]>
  • Loading branch information
Huongg and antonymilne authored Mar 24, 2023
1 parent 22dae20 commit 7b5f222
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 0 deletions.
8 changes: 8 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,11 @@ def _invalidate_cache(self) -> None:
"""Invalidate underlying filesystem caches."""
filepath = get_filepath_str(self._filepath, self._protocol)
self._fs.invalidate_cache(filepath)

def _preview(self, nrows: int = 40) -> Dict:
    """Preview the dataset by loading it with the ``nrows`` load argument set.

    The override is applied to a copy of this dataset so that the load
    arguments of the original dataset are not contaminated.

    Args:
        nrows: Value stored under the ``"nrows"`` key of the copy's load
            arguments before loading. Defaults to 40.

    Returns:
        The result of ``to_dict(orient="split")`` called on the loaded data.
    """
    preview_dataset = self._copy()
    preview_dataset._load_args.update(nrows=nrows)  # pylint: disable=protected-access
    return preview_dataset.load().to_dict(orient="split")
8 changes: 8 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,11 @@ def _invalidate_cache(self) -> None:
"""Invalidate underlying filesystem caches."""
filepath = get_filepath_str(self._filepath, self._protocol)
self._fs.invalidate_cache(filepath)

def _preview(self, nrows: int = 40) -> Dict:
    """Return a "split"-oriented dict built from a load limited via ``nrows``.

    Args:
        nrows: Value written to the ``"nrows"`` load argument of a copy of
            this dataset before that copy is loaded. Defaults to 40.

    Returns:
        Whatever ``to_dict(orient="split")`` returns for the loaded data.
    """
    # Work on a copy so the ``nrows`` override never contaminates the
    # original dataset.
    clone = self._copy()
    load_args = clone._load_args  # pylint: disable=protected-access
    load_args["nrows"] = nrows

    # NOTE(review): assumes load() returns a single DataFrame; if it can
    # return a mapping of sheets, ``to_dict(orient=...)`` would not be
    # available on it -- confirm against the loader.
    frame = clone.load()
    return frame.to_dict(orient="split")
43 changes: 43 additions & 0 deletions kedro-datasets/tests/pandas/test_csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
assert "storage_options" not in ds._save_args
assert "storage_options" not in ds._load_args

@pytest.mark.parametrize(
    ("nrows", "expected"),
    [
        # nrows=0: only the column header survives.
        (0, {"index": [], "columns": ["col1", "col2", "col3"], "data": []}),
        # nrows=1: just the first row.
        (
            1,
            {"index": [0], "columns": ["col1", "col2", "col3"], "data": [[1, 4, 5]]},
        ),
        # nrows=None: both rows are expected back.
        (
            None,
            {
                "index": [0, 1],
                "columns": ["col1", "col2", "col3"],
                "data": [[1, 4, 5], [2, 5, 6]],
            },
        ),
        # nrows=10: same two rows as the nrows=None case.
        (
            10,
            {
                "index": [0, 1],
                "columns": ["col1", "col2", "col3"],
                "data": [[1, 4, 5], [2, 5, 6]],
            },
        ),
    ],
)
def test_preview(self, csv_data_set, dummy_dataframe, nrows, expected):
    """Check that ``_preview`` returns the expected "split"-oriented dict.

    The dummy dataframe is saved through the dataset first, then previewed
    with each parametrized ``nrows`` value.
    """
    csv_data_set.save(dummy_dataframe)
    assert csv_data_set._preview(nrows=nrows) == expected

def test_load_missing_file(self, csv_data_set):
"""Check the error when trying to load missing file."""
pattern = r"Failed while loading data from data set CSVDataSet\(.*\)"
Expand Down
43 changes: 43 additions & 0 deletions kedro-datasets/tests/pandas/test_excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
assert "storage_options" not in ds._save_args
assert "storage_options" not in ds._load_args

@pytest.mark.parametrize(
    ("nrows", "expected"),
    [
        # No rows requested: empty index and data, columns still listed.
        (0, {"index": [], "columns": ["col1", "col2", "col3"], "data": []}),
        # A single row requested.
        (
            1,
            {"index": [0], "columns": ["col1", "col2", "col3"], "data": [[1, 4, 5]]},
        ),
        # nrows=None: both rows are expected back.
        (
            None,
            {
                "index": [0, 1],
                "columns": ["col1", "col2", "col3"],
                "data": [[1, 4, 5], [2, 5, 6]],
            },
        ),
        # nrows=10 is expected to yield the same two rows as nrows=None.
        (
            10,
            {
                "index": [0, 1],
                "columns": ["col1", "col2", "col3"],
                "data": [[1, 4, 5], [2, 5, 6]],
            },
        ),
    ],
)
def test_preview(self, excel_data_set, dummy_dataframe, nrows, expected):
    """Verify the structure ``_preview`` returns for each ``nrows`` value.

    The dummy dataframe is written through the Excel dataset and the
    preview is compared against the expected "split"-oriented dict.
    """
    excel_data_set.save(dummy_dataframe)
    assert excel_data_set._preview(nrows=nrows) == expected

def test_load_missing_file(self, excel_data_set):
"""Check the error when trying to load missing file."""
pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)"
Expand Down

0 comments on commit 7b5f222

Please sign in to comment.