preview-csv-dataset (#129)

* preview func for csv dataset Signed-off-by: huongg <[email protected]> * preview for excel and parquet dataset Signed-off-by: huongg <[email protected]> * tests for _preview Signed-off-by: huongg <[email protected]> * update var name in the loop Signed-off-by: huongg <[email protected]> * create a copy of data and re-use _load Signed-off-by: huongg <[email protected]> * remove parquet dataset preview Signed-off-by: huongg <[email protected]> * default value for nrows Co-authored-by: Antony Milne <[email protected]> * set default val for excel nrows Signed-off-by: huongg <[email protected]> * use orient='tight' in to_dict() Signed-off-by: huongg <[email protected]> * use split instead of tight as current pandas does not support it Signed-off-by: huongg <[email protected]> * add preview tests for excel and csv Signed-off-by: huongg <[email protected]> * formatting Signed-off-by: huongg <[email protected]> * remove unused urllib Signed-off-by: huongg <[email protected]> * pylint: disable=protected-access Signed-off-by: huongg <[email protected]> * ignore import error Signed-off-by: huongg <[email protected]> * formatting Signed-off-by: huongg <[email protected]> * remove ignore import error as it happens locally only Signed-off-by: huongg <[email protected]> * fix lint error Signed-off-by: huongg <[email protected]> --------- Signed-off-by: huongg <[email protected]> Co-authored-by: Antony Milne <[email protected]>
kedro-org · Mar 24, 2023 · 7b5f222 · 7b5f222
1 parent 22dae20
commit 7b5f222
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 0 deletions.
diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -188,3 +188,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -257,3 +257,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py
@@ -137,6 +137,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, csv_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        csv_data_set.save(dummy_dataframe)
+        previewed = csv_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, csv_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set CSVDataSet\(.*\)"

diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py
@@ -121,6 +121,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, excel_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        excel_data_set.save(dummy_dataframe)
+        previewed = excel_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, excel_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)"