From 7b5f222e8bc2eb71cc2120669cf550d26b1f8252 Mon Sep 17 00:00:00 2001
From: Huong Nguyen <32060364+Huongg@users.noreply.github.com>
Date: Fri, 24 Mar 2023 12:45:26 +0000
Subject: [PATCH] preview-csv-dataset (#129)

* preview func for csv dataset

Signed-off-by: huongg <huongg1409@gmail.com>

* preview for excel and parquet dataset

Signed-off-by: huongg <huongg1409@gmail.com>

* tests for _preview

Signed-off-by: huongg <huongg1409@gmail.com>

* update var name in the loop

Signed-off-by: huongg <huongg1409@gmail.com>

* create a copy of data and re-use _load

Signed-off-by: huongg <huongg1409@gmail.com>

* remove parquet dataset preview

Signed-off-by: huongg <huongg1409@gmail.com>

* default value for nrows

Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com>

* set default value for excel nrows

Signed-off-by: huongg <huongg1409@gmail.com>

* use orient='tight' in to_dict()

Signed-off-by: huongg <huongg1409@gmail.com>

* use orient='split' instead of 'tight' as the current pandas version does not support 'tight'

Signed-off-by: huongg <huongg1409@gmail.com>

* add preview tests for excel and csv

Signed-off-by: huongg <huongg1409@gmail.com>

* formatting

Signed-off-by: huongg <huongg1409@gmail.com>

* remove unused urllib

Signed-off-by: huongg <huongg1409@gmail.com>

* pylint: disable=protected-access

Signed-off-by: huongg <huongg1409@gmail.com>

* ignore import error

Signed-off-by: huongg <huongg1409@gmail.com>

* formatting

Signed-off-by: huongg <huongg1409@gmail.com>

* remove ignore import error as it happens locally only

Signed-off-by: huongg <huongg1409@gmail.com>

* fix lint error

Signed-off-by: huongg <huongg1409@gmail.com>

---------

Signed-off-by: huongg <huongg1409@gmail.com>
Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com>
---
 .../kedro_datasets/pandas/csv_dataset.py      |  8 ++++
 .../kedro_datasets/pandas/excel_dataset.py    |  8 ++++
 .../tests/pandas/test_csv_dataset.py          | 43 +++++++++++++++++++
 .../tests/pandas/test_excel_dataset.py        | 43 +++++++++++++++++++
 4 files changed, 102 insertions(+)

diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
index 7b20813f3..3c65d49b5 100644
--- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -188,3 +188,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
index 4a981bc11..45aee3192 100644
--- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -257,3 +257,11 @@ def _invalidate_cache(self) -> None:
         """Invalidate underlying filesystem caches."""
         filepath = get_filepath_str(self._filepath, self._protocol)
         self._fs.invalidate_cache(filepath)
+
+    def _preview(self, nrows: int = 40) -> Dict:
+        # Create a copy so it doesn't contaminate the original dataset
+        dataset_copy = self._copy()
+        dataset_copy._load_args["nrows"] = nrows  # pylint: disable=protected-access
+        data = dataset_copy.load()
+
+        return data.to_dict(orient="split")
diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py
index 267144ecc..5cc1ee36b 100644
--- a/kedro-datasets/tests/pandas/test_csv_dataset.py
+++ b/kedro-datasets/tests/pandas/test_csv_dataset.py
@@ -137,6 +137,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, csv_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        csv_data_set.save(dummy_dataframe)
+        previewed = csv_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, csv_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set CSVDataSet\(.*\)"
diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py
index c568d15d0..1080cc9b6 100644
--- a/kedro-datasets/tests/pandas/test_excel_dataset.py
+++ b/kedro-datasets/tests/pandas/test_excel_dataset.py
@@ -121,6 +121,49 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path):
         assert "storage_options" not in ds._save_args
         assert "storage_options" not in ds._load_args
 
+    @pytest.mark.parametrize(
+        "nrows,expected",
+        [
+            (
+                0,
+                {
+                    "index": [],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [],
+                },
+            ),
+            (
+                1,
+                {
+                    "index": [0],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5]],
+                },
+            ),
+            (
+                None,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+            (
+                10,
+                {
+                    "index": [0, 1],
+                    "columns": ["col1", "col2", "col3"],
+                    "data": [[1, 4, 5], [2, 5, 6]],
+                },
+            ),
+        ],
+    )
+    def test_preview(self, excel_data_set, dummy_dataframe, nrows, expected):
+        """Test _preview returns the correct data structure."""
+        excel_data_set.save(dummy_dataframe)
+        previewed = excel_data_set._preview(nrows=nrows)
+        assert previewed == expected
+
     def test_load_missing_file(self, excel_data_set):
         """Check the error when trying to load missing file."""
         pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)"