From 7b5f222e8bc2eb71cc2120669cf550d26b1f8252 Mon Sep 17 00:00:00 2001 From: Huong Nguyen <32060364+Huongg@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:45:26 +0000 Subject: [PATCH] preview-csv-dataset (#129) * preview func for csv dataset Signed-off-by: huongg * preview for excel and parquet dataset Signed-off-by: huongg * tests for _preview Signed-off-by: huongg * update var name in the loop Signed-off-by: huongg * create a copy of data and re-use _load Signed-off-by: huongg * remove parquet dataset preview Signed-off-by: huongg * default value for nrows Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> * set default val for excel nrows Signed-off-by: huongg * use orient='tight' in to_dict() Signed-off-by: huongg * use split instead of tight as current pandas does not support it Signed-off-by: huongg * add preview tests for excel and csv Signed-off-by: huongg * formatting Signed-off-by: huongg * remove unused urllib Signed-off-by: huongg * pylint: disable=protected-access Signed-off-by: huongg * ignore import error Signed-off-by: huongg * formatting Signed-off-by: huongg * remove ignore import error as it happens locally only Signed-off-by: huongg * fix lint error Signed-off-by: huongg --------- Signed-off-by: huongg Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> --- .../kedro_datasets/pandas/csv_dataset.py | 8 ++++ .../kedro_datasets/pandas/excel_dataset.py | 8 ++++ .../tests/pandas/test_csv_dataset.py | 43 +++++++++++++++++++ .../tests/pandas/test_excel_dataset.py | 43 +++++++++++++++++++ 4 files changed, 102 insertions(+) diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 7b20813f3..3c65d49b5 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -188,3 +188,11 @@ def _invalidate_cache(self) -> None: """Invalidate underlying filesystem caches.""" 
filepath = get_filepath_str(self._filepath, self._protocol) self._fs.invalidate_cache(filepath) + + def _preview(self, nrows: int = 40) -> Dict: + # Create a copy so it doesn't contaminate the original dataset + dataset_copy = self._copy() + dataset_copy._load_args["nrows"] = nrows # pylint: disable=protected-access + data = dataset_copy.load() + + return data.to_dict(orient="split") diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 4a981bc11..45aee3192 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -257,3 +257,11 @@ def _invalidate_cache(self) -> None: """Invalidate underlying filesystem caches.""" filepath = get_filepath_str(self._filepath, self._protocol) self._fs.invalidate_cache(filepath) + + def _preview(self, nrows: int = 40) -> Dict: + # Create a copy so it doesn't contaminate the original dataset + dataset_copy = self._copy() + dataset_copy._load_args["nrows"] = nrows # pylint: disable=protected-access + data = dataset_copy.load() + + return data.to_dict(orient="split") diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 267144ecc..5cc1ee36b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -137,6 +137,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): assert "storage_options" not in ds._save_args assert "storage_options" not in ds._load_args + @pytest.mark.parametrize( + "nrows,expected", + [ + ( + 0, + { + "index": [], + "columns": ["col1", "col2", "col3"], + "data": [], + }, + ), + ( + 1, + { + "index": [0], + "columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5]], + }, + ), + ( + None, + { + "index": [0, 1], + "columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5], [2, 5, 6]], + }, + ), + ( + 10, + { + "index": [0, 1], + 
"columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5], [2, 5, 6]], + }, + ), + ], + ) + def test_preview(self, csv_data_set, dummy_dataframe, nrows, expected): + """Test _preview returns the correct data structure.""" + csv_data_set.save(dummy_dataframe) + previewed = csv_data_set._preview(nrows=nrows) + assert previewed == expected + def test_load_missing_file(self, csv_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set CSVDataSet\(.*\)" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index c568d15d0..1080cc9b6 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -121,6 +121,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): assert "storage_options" not in ds._save_args assert "storage_options" not in ds._load_args + @pytest.mark.parametrize( + "nrows,expected", + [ + ( + 0, + { + "index": [], + "columns": ["col1", "col2", "col3"], + "data": [], + }, + ), + ( + 1, + { + "index": [0], + "columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5]], + }, + ), + ( + None, + { + "index": [0, 1], + "columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5], [2, 5, 6]], + }, + ), + ( + 10, + { + "index": [0, 1], + "columns": ["col1", "col2", "col3"], + "data": [[1, 4, 5], [2, 5, 6]], + }, + ), + ], + ) + def test_preview(self, excel_data_set, dummy_dataframe, nrows, expected): + """Test _preview returns the correct data structure.""" + excel_data_set.save(dummy_dataframe) + previewed = excel_data_set._preview(nrows=nrows) + assert previewed == expected + def test_load_missing_file(self, excel_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)"