Commit
♻️ Import from kedro_datasets instead of kedro.extras (#458)
Galileo-Galilei committed Oct 21, 2023
1 parent 57caf45 commit afad466
Showing 16 changed files with 50 additions and 49 deletions.
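
The pattern below repeats across every file: imports move from the deprecated ``kedro.extras.datasets`` package to the standalone ``kedro_datasets`` package, and the pandas classes adopt the new ``Dataset`` capitalisation. A minimal before/after sketch of the import change:

```python
# Before: dataset classes bundled inside kedro itself
# from kedro.extras.datasets.pandas import CSVDataSet
# from kedro.extras.datasets.pickle import PickleDataSet

# After: dataset classes from the standalone kedro_datasets package.
# The pandas class is renamed CSVDataSet -> CSVDataset, while the pickle
# class keeps its PickleDataSet spelling in this commit.
from kedro_datasets.pandas import CSVDataset
from kedro_datasets.pickle import PickleDataSet
```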
8 changes: 4 additions & 4 deletions docs/source/03_getting_started/02_first_steps.md
@@ -33,7 +33,7 @@ kedro run
If the pipeline executes properly, you should see the following log:

```console
-2020-07-13 21:29:25,401 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet)...
+2020-07-13 21:29:25,401 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataset)...
2020-07-13 21:29:25,562 - kedro.io.data_catalog - INFO - Loading data from `params:example_test_data_ratio` (MemoryDataSet)...
2020-07-13 21:29:25,969 - kedro.pipeline.node - INFO - Running node: split_data([example_iris_data,params:example_test_data_ratio]) -> [example_test_x,example_test_y,example_train_x,example_train_y]
2020-07-13 21:29:26,053 - kedro.io.data_catalog - INFO - Saving data to `example_train_x` (MemoryDataSet)...
@@ -124,7 +124,7 @@ First, open the ``catalog.yml`` file which should look like this:
# template. Please feel free to remove it once you remove the example pipeline.

example_iris_data:
-  type: pandas.CSVDataSet
+  type: pandas.CSVDataset
filepath: data/01_raw/iris.csv

```
@@ -136,13 +136,13 @@ And persist the model as a pickle with the ``MlflowArtifactDataset`` class:
# template. Please feel free to remove it once you remove the example pipeline.

example_iris_data:
-  type: pandas.CSVDataSet
+  type: pandas.CSVDataset
filepath: data/01_raw/iris.csv

example_model:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pickle.PickleDataSet
+    type: pickle.PickleDataset
filepath: data/06_models/trained_model.pkl
```
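
For reference, the same catalog entry can be declared through the python API; a minimal sketch assuming the kedro 0.18 class names used in this diff:

```python
from kedro_datasets.pickle import PickleDataSet

from kedro_mlflow.io.artifacts import MlflowArtifactDataset

# Equivalent of the `example_model` catalog entry above: the pickle is written
# locally, then logged as an artifact of the active mlflow run on save.
example_model = MlflowArtifactDataset(
    data_set={"type": PickleDataSet, "filepath": "data/06_models/trained_model.pkl"}
)
```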
16 changes: 8 additions & 8 deletions docs/source/04_experimentation_tracking/03_version_datasets.md
@@ -20,7 +20,7 @@ Since it is an ``AbstractDataSet``, it can be used with the YAML API. Assume tha

```yaml
my_dataset_to_version:
-  type: pandas.CSVDataSet
+  type: pandas.CSVDataset
filepath: /path/to/a/destination/file.csv
```
@@ -30,7 +30,7 @@ You can change it to:
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/LOCAL/destination/file.csv # must be a local file, wherever you want to log the data in the end
```
@@ -46,7 +46,7 @@ The ``MlflowArtifactDataset`` takes a ``data_set`` argument which is a python di
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv
load_args:
sep: ;
@@ -61,11 +61,11 @@ Like all Kedro ``AbstractDataSet``, ``MlflowArtifactDataset`` is callable in the

```python
from kedro_mlflow.io.artifacts import MlflowArtifactDataset
-from kedro.extras.datasets.pandas import CSVDataSet
+from kedro_datasets.pandas import CSVDataset

csv_dataset = MlflowArtifactDataset(
data_set={
"type": CSVDataSet, # either a string "pandas.CSVDataSet" or the class
"type": CSVDataset, # either a string "pandas.CSVDataset" or the class
"filepath": r"/path/to/a/local/destination/file.csv",
}
)
@@ -91,7 +91,7 @@ The ``MlflowArtifactDataset`` has an extra attribute ``run_id`` which specifies
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv # must be a local filepath, no matter what is your actual mlflow storage (S3 or other)
run_id: 13245678910111213 # a valid mlflow run to log in. If None, default to active run
```
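
The same override is available through the python API; a minimal sketch, reusing the placeholder run id from the YAML above:

```python
from kedro_datasets.pandas import CSVDataset

from kedro_mlflow.io.artifacts import MlflowArtifactDataset

# Log the artifact into a specific mlflow run instead of the active one;
# "13245678910111213" is the placeholder run id from the YAML example above.
csv_dataset = MlflowArtifactDataset(
    data_set={"type": CSVDataset, "filepath": "/path/to/a/local/destination/file.csv"},
    run_id="13245678910111213",
)
```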
@@ -106,7 +106,7 @@ You may want to reuse the artifact of a previous run in another one,
my_dataset_to_reload:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv # must be a local filepath, no matter what is your actual mlflow storage (S3 or other)
run_id: 13245678910111213 # a valid mlflow run with the existing artifact. It must be named "file.csv"
```
@@ -121,7 +121,7 @@ With the example below, the artifact will be logged in mlflow within a `reporting` f
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv
artifact_path: reporting # relative path where the remote artifact must be stored. if None, saved in root folder.
```
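
The ``artifact_path`` argument can be passed the same way through the python API, as this commit's tests do; a sketch:

```python
from kedro_datasets.pandas import CSVDataset

from kedro_mlflow.io.artifacts import MlflowArtifactDataset

# The csv is written locally, then logged under the "reporting" folder of the
# run's artifact root rather than at the root itself.
csv_dataset = MlflowArtifactDataset(
    data_set={"type": CSVDataset, "filepath": "/path/to/a/local/destination/file.csv"},
    artifact_path="reporting",
)
```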
3 changes: 1 addition & 2 deletions docs/source/05_pipeline_serving/04_hook_pipeline_ml.md
@@ -14,7 +14,6 @@ For consistency, you may want to log an inference pipeline (including some data


def register_pipelines(self) -> Dict[str, Pipeline]:
-
ml_pipeline = create_ml_pipeline()
training_pipeline_ml = pipeline_ml_factory(
training=ml_pipeline.only_nodes_with_tags("training"),
@@ -37,7 +36,7 @@ For consistency, you may want to log an inference pipeline (including some data

```yaml
label_encoder:
-  type: pickle.PickleDataSet # <- This must be any Kedro Dataset other than "MemoryDataSet"
+  type: pickle.PickleDataset # <- This must be any Kedro Dataset other than "MemoryDataSet"
filepath: data/06_models/label_encoder.pkl # <- This must be a local path, no matter what is your mlflow storage (S3 or other)
```
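
For context, a hedged reconstruction of the full ``register_pipelines`` truncated in the hunk above, written as a module-level registry function; ``create_ml_pipeline`` comes from the hunk itself, while the ``inference`` tag and ``input_name`` value are assumptions, not part of the diff:

```python
from typing import Dict

from kedro.pipeline import Pipeline

from kedro_mlflow.pipeline import pipeline_ml_factory


def register_pipelines() -> Dict[str, Pipeline]:
    ml_pipeline = create_ml_pipeline()  # project helper, as in the hunk above
    training_pipeline_ml = pipeline_ml_factory(
        training=ml_pipeline.only_nodes_with_tags("training"),
        inference=ml_pipeline.only_nodes_with_tags("inference"),  # assumed tag
        input_name="instances",  # assumption: the inference pipeline's input dataset
    )
    return {"__default__": training_pipeline_ml}
```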

8 changes: 4 additions & 4 deletions docs/source/07_python_objects/01_DataSets.md
@@ -8,7 +8,7 @@
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv
```
@@ -18,7 +18,7 @@ or with additional parameters:
my_dataset_to_version:
type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
data_set:
-    type: pandas.CSVDataSet # or any valid kedro DataSet
+    type: pandas.CSVDataset # or any valid kedro DataSet
filepath: /path/to/a/local/destination/file.csv
load_args:
sep: ;
@@ -33,10 +33,10 @@ or with the python API:
```python
from kedro_mlflow.io.artifacts import MlflowArtifactDataset
-from kedro.extras.datasets.pandas import CSVDataSet
+from kedro_datasets.pandas import CSVDataset

csv_dataset = MlflowArtifactDataset(
-    data_set={"type": CSVDataSet, "filepath": r"/path/to/a/local/destination/file.csv"}
+    data_set={"type": CSVDataset, "filepath": r"/path/to/a/local/destination/file.csv"}
)
csv_dataset.save(data=pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
```
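
The snippet above assumes ``pandas`` is already imported; a self-contained version of the new code:

```python
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro_mlflow.io.artifacts import MlflowArtifactDataset

csv_dataset = MlflowArtifactDataset(
    data_set={"type": CSVDataset, "filepath": r"/path/to/a/local/destination/file.csv"}
)
# save() writes the csv locally, then logs it as an artifact of the active mlflow run
csv_dataset.save(data=pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
```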
2 changes: 1 addition & 1 deletion kedro_mlflow/mlflow/kedro_pipeline_model.py
@@ -2,11 +2,11 @@
from pathlib import Path
from typing import Dict, Optional, Union

-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.framework.hooks import _create_hook_manager
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline
from kedro.runner import AbstractRunner, SequentialRunner
+from kedro_datasets.pickle import PickleDataSet
from mlflow.pyfunc import PythonModel

from kedro_mlflow.pipeline.pipeline_ml import PipelineML
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
kedro>=0.18.1, <0.19.0
+kedro_datasets
mlflow>=1.0.0, <3.0.0
pydantic>=1.0.0, <2.0.0
1 change: 1 addition & 0 deletions setup.py
@@ -54,6 +54,7 @@ def _parse_requirements(path, encoding="utf-8"):
"flake8==6.0.0", # ensure consistency with pre-commit
"black==23.7.0", # pin black version because it is not compatible with a pip range (because of non semver version number)
"isort==5.12.0", # ensure consistency with pre-commit
"kedro-datasets[pandas]",
],
"dev": [
"pre-commit>=2.0.0,<4.0.0",
8 changes: 4 additions & 4 deletions tests/framework/cli/test_cli_modelify.py
@@ -78,7 +78,7 @@ def register_pipelines():

catalog_yml = f"""
trained_model:
-  type: pickle.PickleDataSet
+  type: pickle.PickleDataset
filepath: {model_filepath}
"""

@@ -111,10 +111,10 @@ def kp_for_modelify_persistent_input(kp_for_modelify):
data_filepath = (kp_for_modelify / "data" / "my_input_data.pkl").as_posix()
catalog_yml = f"""
trained_model:
-  type: pickle.PickleDataSet
+  type: pickle.PickleDataset
filepath: {model_filepath}
my_input_data:
-  type: pickle.PickleDataSet
+  type: pickle.PickleDataset
filepath: {data_filepath}
"""

@@ -182,7 +182,7 @@ def register_pipelines():

catalog_yml = f"""
trained_model:
-  type: pickle.PickleDataSet
+  type: pickle.PickleDataset
filepath: {model_filepath}
"""

2 changes: 1 addition & 1 deletion tests/framework/hooks/test_hook_deactivate_tracking.py
@@ -80,7 +80,7 @@ def catalog_config(kedro_project_path):
"artifact_data": {
"type": "kedro_mlflow.io.artifacts.MlflowArtifactDataset",
"data_set": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": fake_data_filepath,
},
},
2 changes: 1 addition & 1 deletion tests/framework/hooks/test_hook_log_metrics.py
@@ -1,12 +1,12 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner
+from kedro_datasets.pickle import PickleDataSet

from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook
from kedro_mlflow.io.metrics import (
2 changes: 1 addition & 1 deletion tests/framework/hooks/test_hook_pipeline_ml.py
@@ -3,12 +3,12 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner
+from kedro_datasets.pickle import PickleDataSet
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient

26 changes: 13 additions & 13 deletions tests/io/artifacts/test_mlflow_artifact_dataset.py
@@ -3,9 +3,9 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pandas import CSVDataSet
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io import PartitionedDataSet
+from kedro_datasets.pandas import CSVDataset
+from kedro_datasets.pickle import PickleDataSet
from mlflow.tracking import MlflowClient
from pytest_lazyfixture import lazy_fixture

@@ -30,12 +30,12 @@ def df2():
@pytest.mark.parametrize(
"dataset,extension,data,artifact_path",
[
-        (CSVDataSet, ".csv", lazy_fixture("df1"), None),
-        ("pandas.CSVDataSet", ".csv", lazy_fixture("df1"), None),
+        (CSVDataset, ".csv", lazy_fixture("df1"), None),
+        ("pandas.CSVDataset", ".csv", lazy_fixture("df1"), None),
(PickleDataSet, ".pkl", lazy_fixture("df1"), None),
("pickle.PickleDataSet", ".pkl", lazy_fixture("df1"), None),
-        (CSVDataSet, ".csv", lazy_fixture("df1"), "artifact_dir"),
-        ("pandas.CSVDataSet", ".csv", lazy_fixture("df1"), "artifact_dir"),
+        (CSVDataset, ".csv", lazy_fixture("df1"), "artifact_dir"),
+        ("pandas.CSVDataset", ".csv", lazy_fixture("df1"), "artifact_dir"),
(PickleDataSet, ".pkl", lazy_fixture("df1"), "artifact_dir"),
(
"pickle.PickleDataSet",
@@ -99,7 +99,7 @@ def test_artifact_dataset_save_with_run_id(

# then same scenario but the run_id where data is saved is specified
mlflow_csv_dataset = MlflowArtifactDataset(
-        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
+        data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()),
run_id=run_id,
)
mlflow_csv_dataset.save(df1)
@@ -136,7 +136,7 @@ def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri,

mlflow_csv_dataset = MlflowArtifactDataset(
data_set=dict(
-            type=CSVDataSet,
+            type=CSVDataset,
filepath=(tmp_path / "df1.csv").as_posix(),
versioned=True,
),
@@ -200,7 +200,7 @@ def test_artifact_dataset_logging_deactivation(tmp_path, tracking_uri):

def test_mlflow_artifact_logging_deactivation_is_bool(tmp_path):
mlflow_csv_dataset = MlflowArtifactDataset(
-        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix())
+        data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix())
)

with pytest.raises(ValueError, match="_logging_activated must be a boolean"):
@@ -212,7 +212,7 @@ def test_artifact_dataset_load_with_run_id(tmp_path, tracking_uri, df1, df2):

# define the logger
mlflow_csv_dataset = MlflowArtifactDataset(
-        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df.csv").as_posix())
+        data_set=dict(type=CSVDataset, filepath=(tmp_path / "df.csv").as_posix())
)

# create a first run, save a first dataset
@@ -240,7 +240,7 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path(

# save first and retrieve run id
mlflow_csv_dataset1 = MlflowArtifactDataset(
-        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
+        data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()),
artifact_path=artifact_path,
)
with mlflow.start_run():
@@ -251,7 +251,7 @@
).unlink() # we need to delete the data, else it is automatically reused instead of downloading
# same as before, but a closed run_id is specified
mlflow_csv_dataset2 = MlflowArtifactDataset(
-        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
+        data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()),
artifact_path=artifact_path,
run_id=first_run_id,
)
@@ -274,7 +274,7 @@ def test_partitioned_dataset_save_and_reload(
data_set=dict(
type=PartitionedDataSet,
path=(tmp_path / "df_dir").as_posix(),
-            dataset="pandas.CSVDataSet",
+            dataset="pandas.CSVDataset",
filename_suffix=".csv",
),
)
2 changes: 1 addition & 1 deletion tests/io/models/test_mlflow_model_logger_dataset.py
@@ -3,10 +3,10 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io import DataCatalog, MemoryDataSet
from kedro.io.core import DataSetError
from kedro.pipeline import Pipeline, node
+from kedro_datasets.pickle import PickleDataSet
from mlflow.tracking import MlflowClient
from sklearn.linear_model import LinearRegression

2 changes: 1 addition & 1 deletion tests/io/models/test_mlflow_model_saver_dataset.py
@@ -3,9 +3,9 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
+from kedro_datasets.pickle import PickleDataSet
from sklearn.linear_model import LinearRegression

from kedro_mlflow.io.models import MlflowModelSaverDataSet
2 changes: 1 addition & 1 deletion tests/mlflow/test_kedro_pipeline_model.py
@@ -4,9 +4,9 @@
import mlflow
import pandas as pd
import pytest
-from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
+from kedro_datasets.pickle import PickleDataSet
from sklearn.linear_model import LinearRegression

from kedro_mlflow.io.models import MlflowModelSaverDataSet