diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
index 7b20813f3..3c65d49b5 100644
--- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -188,3 +188,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
index 4a981bc11..45aee3192 100644
--- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -257,3 +257,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py
index 267144ecc..5cc1ee36b 100644
--- a/kedro-datasets/tests/pandas/test_csv_dataset.py
+++ b/kedro-datasets/tests/pandas/test_csv_dataset.py
@@ -137,6 +137,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, csv_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        csv_data_set.save(dummy_dataframe)
+        previewed = csv_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, csv_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set CSVDataSet\(.*\)"
diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py
index c568d15d0..1080cc9b6 100644
--- a/kedro-datasets/tests/pandas/test_excel_dataset.py
+++ b/kedro-datasets/tests/pandas/test_excel_dataset.py
@@ -121,6 +121,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, excel_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        excel_data_set.save(dummy_dataframe)
+        previewed = excel_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, excel_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)"