diff --git a/docs/source/03_getting_started/02_first_steps.md b/docs/source/03_getting_started/02_first_steps.md index 11465a9e..ed0ab074 100644 --- a/docs/source/03_getting_started/02_first_steps.md +++ b/docs/source/03_getting_started/02_first_steps.md @@ -33,7 +33,7 @@ kedro run If the pipeline executes properly, you should see the following log: ```console -2020-07-13 21:29:25,401 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet)... +2020-07-13 21:29:25,401 - kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataset)... 2020-07-13 21:29:25,562 - kedro.io.data_catalog - INFO - Loading data from `params:example_test_data_ratio` (MemoryDataSet)... 2020-07-13 21:29:25,969 - kedro.pipeline.node - INFO - Running node: split_data([example_iris_data,params:example_test_data_ratio]) -> [example_test_x,example_test_y,example_train_x,example_train_y] 2020-07-13 21:29:26,053 - kedro.io.data_catalog - INFO - Saving data to `example_train_x` (MemoryDataSet)... @@ -124,7 +124,7 @@ First, open the ``catalog.yml`` file which should like this: # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/iris.csv ``` @@ -136,13 +136,13 @@ And persist the model as a pickle with the ``MlflowArtifactDataset`` class: # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/iris.csv example_model: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pickle.PickleDataSet + type: pickle.PickleDataset filepath: data/06_models/trained_model.pkl ``` diff --git a/docs/source/04_experimentation_tracking/03_version_datasets.md b/docs/source/04_experimentation_tracking/03_version_datasets.md index aa5fcf21..e4d95725 100644 --- a/docs/source/04_experimentation_tracking/03_version_datasets.md +++ b/docs/source/04_experimentation_tracking/03_version_datasets.md @@ -20,7 +20,7 @@ Since it is an ``AbstractDataSet``, it can be used with the YAML API. 
Assume tha ```yaml my_dataset_to_version: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: /path/to/a/destination/file.csv ``` @@ -30,7 +30,7 @@ You can change it to: my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/LOCAL/destination/file.csv # must be a local file, wherever you want to log the data in the end ``` @@ -46,7 +46,7 @@ The ``MlflowArtifactDataset`` takes a ``data_set`` argument which is a python di my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv load_args: sep: ; @@ -61,11 +61,11 @@ Like all Kedro ``AbstractDataSet``, ``MlflowArtifactDataset`` is callable in the ```python from kedro_mlflow.io.artifacts import MlflowArtifactDataset -from kedro.extras.datasets.pandas import CSVDataSet +from kedro_datasets.pandas import CSVDataset csv_dataset = MlflowArtifactDataSet( data_set={ - "type": CSVDataSet, # either a string "pandas.CSVDataSet" or the class + "type": CSVDataset, # either a string "pandas.CSVDataset" or the class "filepath": r"/path/to/a/local/destination/file.csv", } ) @@ -91,7 +91,7 @@ The ``MlflowArtifactDataset`` has an extra attribute ``run_id`` which specifies my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv # must be a local filepath, no matter what is your actual mlflow storage (S3 or other) run_id: 13245678910111213 # a valid mlflow run to log in. If None, default to active run ``` @@ -106,7 +106,7 @@ You may want to reuse th artifact of a previous run to reuse it in another one, my_dataset_to_reload: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv # must be a local filepath, no matter what is your actual mlflow storage (S3 or other) run_id: 13245678910111213 # a valid mlflow run with the existing artifact. It must be named "file.csv" ``` @@ -121,7 +121,7 @@ With below example, the artifact will be logged in mlflow within a `reporting` f my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv artifact_path: reporting # relative path where the remote artifact must be stored. if None, saved in root folder. 
``` diff --git a/docs/source/05_pipeline_serving/04_hook_pipeline_ml.md b/docs/source/05_pipeline_serving/04_hook_pipeline_ml.md index 2ec5f726..7ac60d24 100644 --- a/docs/source/05_pipeline_serving/04_hook_pipeline_ml.md +++ b/docs/source/05_pipeline_serving/04_hook_pipeline_ml.md @@ -14,7 +14,6 @@ For consistency, you may want to log an inference pipeline (including some data def register_pipelines(self) -> Dict[str, Pipeline]: - ml_pipeline = create_ml_pipeline() training_pipeline_ml = pipeline_ml_factory( training=ml_pipeline.only_nodes_with_tags("training"), @@ -37,7 +36,7 @@ For consistency, you may want to log an inference pipeline (including some data ```yaml label_encoder: - type: pickle.PickleDataSet # <- This must be any Kedro Dataset other than "MemoryDataSet" + type: pickle.PickleDataset # <- This must be any Kedro Dataset other than "MemoryDataSet" filepath: data/06_models/label_encoder.pkl # <- This must be a local path, no matter what is your mlflow storage (S3 or other) ``` diff --git a/docs/source/07_python_objects/01_DataSets.md b/docs/source/07_python_objects/01_DataSets.md index cf1c4a8b..0282d809 100644 --- a/docs/source/07_python_objects/01_DataSets.md +++ b/docs/source/07_python_objects/01_DataSets.md @@ -8,7 +8,7 @@ my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv ``` @@ -18,7 +18,7 @@ or with additional parameters: my_dataset_to_version: type: kedro_mlflow.io.artifacts.MlflowArtifactDataset data_set: - type: pandas.CSVDataSet # or any valid kedro DataSet + type: pandas.CSVDataset # or any valid kedro DataSet filepath: /path/to/a/local/destination/file.csv load_args: sep: ; @@ -33,10 +33,10 @@ or with the python API: ```python from kedro_mlflow.io.artifacts import MlflowArtifactDataset -from kedro.extras.datasets.pandas import CSVDataSet +from kedro_datasets.pandas import CSVDataset csv_dataset = MlflowArtifactDataset( - data_set={"type": CSVDataSet, "filepath": r"/path/to/a/local/destination/file.csv"} + data_set={"type": CSVDataset, "filepath": r"/path/to/a/local/destination/file.csv"} ) csv_dataset.save(data=pd.DataFrame({"a": [1, 2], "b": [3, 4]})) ``` diff --git a/kedro_mlflow/mlflow/kedro_pipeline_model.py b/kedro_mlflow/mlflow/kedro_pipeline_model.py index 1372fe28..244a1b7a 100644 --- a/kedro_mlflow/mlflow/kedro_pipeline_model.py +++ b/kedro_mlflow/mlflow/kedro_pipeline_model.py @@ -2,11 +2,11 @@ from pathlib import Path from typing import Dict, Optional, Union -from kedro.extras.datasets.pickle import PickleDataSet from kedro.framework.hooks import _create_hook_manager from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline from kedro.runner import AbstractRunner, SequentialRunner +from kedro_datasets.pickle import PickleDataSet from mlflow.pyfunc import PythonModel from kedro_mlflow.pipeline.pipeline_ml import PipelineML diff --git a/requirements.txt b/requirements.txt index 3386c6f9..d2386a44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ kedro>=0.18.1, <0.19.0 +kedro_datasets mlflow>=1.0.0, <3.0.0 pydantic>=1.0.0, <2.0.0 diff --git a/setup.py b/setup.py index 047f82cb..cacb6ab1 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ def _parse_requirements(path, encoding="utf-8"): "flake8==6.0.0", # ensure consistency with pre-commit "black==23.7.0", # pin black version because it is not compatible with a pip 
range (because of non semver version number) "isort==5.12.0", # ensure consistency with pre-commit + "kedro-datasets[pandas]", ], "dev": [ "pre-commit>=2.0.0,<4.0.0", diff --git a/tests/framework/cli/test_cli_modelify.py b/tests/framework/cli/test_cli_modelify.py index ee84cc18..c2de59d3 100644 --- a/tests/framework/cli/test_cli_modelify.py +++ b/tests/framework/cli/test_cli_modelify.py @@ -78,7 +78,7 @@ def register_pipelines(): catalog_yml = f""" trained_model: - type: pickle.PickleDataSet + type: pickle.PickleDataset filepath: {model_filepath} """ @@ -111,10 +111,10 @@ def kp_for_modelify_persistent_input(kp_for_modelify): data_filepath = (kp_for_modelify / "data" / "my_input_data.pkl").as_posix() catalog_yml = f""" trained_model: - type: pickle.PickleDataSet + type: pickle.PickleDataset filepath: {model_filepath} my_input_data: - type: pickle.PickleDataSet + type: pickle.PickleDataset filepath: {data_filepath} """ @@ -182,7 +182,7 @@ def register_pipelines(): catalog_yml = f""" trained_model: - type: pickle.PickleDataSet + type: pickle.PickleDataset filepath: {model_filepath} """ diff --git a/tests/framework/hooks/test_hook_deactivate_tracking.py b/tests/framework/hooks/test_hook_deactivate_tracking.py index 97eac656..b28a5050 100644 --- a/tests/framework/hooks/test_hook_deactivate_tracking.py +++ b/tests/framework/hooks/test_hook_deactivate_tracking.py @@ -80,7 +80,7 @@ def catalog_config(kedro_project_path): "artifact_data": { "type": "kedro_mlflow.io.artifacts.MlflowArtifactDataset", "data_set": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": fake_data_filepath, }, }, diff --git a/tests/framework/hooks/test_hook_log_metrics.py b/tests/framework/hooks/test_hook_log_metrics.py index 04c761a2..d5a2eaad 100644 --- a/tests/framework/hooks/test_hook_log_metrics.py +++ b/tests/framework/hooks/test_hook_log_metrics.py @@ -1,12 +1,12 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pickle import PickleDataSet from kedro.framework.session import KedroSession from kedro.framework.startup import bootstrap_project from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node from kedro.runner import SequentialRunner +from kedro_datasets.pickle import PickleDataSet from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook from kedro_mlflow.io.metrics import ( diff --git a/tests/framework/hooks/test_hook_pipeline_ml.py b/tests/framework/hooks/test_hook_pipeline_ml.py index 915ac144..baf74290 100644 --- a/tests/framework/hooks/test_hook_pipeline_ml.py +++ b/tests/framework/hooks/test_hook_pipeline_ml.py @@ -3,12 +3,12 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pickle import PickleDataSet from kedro.framework.session import KedroSession from kedro.framework.startup import bootstrap_project from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node from kedro.runner import SequentialRunner +from kedro_datasets.pickle import PickleDataSet from mlflow.models import infer_signature from mlflow.tracking import MlflowClient diff --git a/tests/io/artifacts/test_mlflow_artifact_dataset.py b/tests/io/artifacts/test_mlflow_artifact_dataset.py index 802ffcf0..f82b1364 100644 --- a/tests/io/artifacts/test_mlflow_artifact_dataset.py +++ b/tests/io/artifacts/test_mlflow_artifact_dataset.py @@ -3,9 +3,9 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.extras.datasets.pickle import 
PickleDataSet from kedro.io import PartitionedDataSet +from kedro_datasets.pandas import CSVDataset +from kedro_datasets.pickle import PickleDataSet from mlflow.tracking import MlflowClient from pytest_lazyfixture import lazy_fixture @@ -30,12 +30,12 @@ def df2(): @pytest.mark.parametrize( "dataset,extension,data,artifact_path", [ - (CSVDataSet, ".csv", lazy_fixture("df1"), None), - ("pandas.CSVDataSet", ".csv", lazy_fixture("df1"), None), + (CSVDataset, ".csv", lazy_fixture("df1"), None), + ("pandas.CSVDataset", ".csv", lazy_fixture("df1"), None), (PickleDataSet, ".pkl", lazy_fixture("df1"), None), ("pickle.PickleDataSet", ".pkl", lazy_fixture("df1"), None), - (CSVDataSet, ".csv", lazy_fixture("df1"), "artifact_dir"), - ("pandas.CSVDataSet", ".csv", lazy_fixture("df1"), "artifact_dir"), + (CSVDataset, ".csv", lazy_fixture("df1"), "artifact_dir"), + ("pandas.CSVDataset", ".csv", lazy_fixture("df1"), "artifact_dir"), (PickleDataSet, ".pkl", lazy_fixture("df1"), "artifact_dir"), ( "pickle.PickleDataSet", @@ -99,7 +99,7 @@ def test_artifact_dataset_save_with_run_id( # then same scenario but the run_id where data is saved is specified mlflow_csv_dataset = MlflowArtifactDataset( - data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()), + data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()), run_id=run_id, ) mlflow_csv_dataset.save(df1) @@ -136,7 +136,7 @@ def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri, mlflow_csv_dataset = MlflowArtifactDataset( data_set=dict( - type=CSVDataSet, + type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix(), versioned=True, ), @@ -200,7 +200,7 @@ def test_artifact_dataset_logging_deactivation(tmp_path, tracking_uri): def test_mlflow_artifact_logging_deactivation_is_bool(tmp_path): mlflow_csv_dataset = MlflowArtifactDataset( - data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()) + data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()) ) with pytest.raises(ValueError, match="_logging_activated must be a boolean"): @@ -212,7 +212,7 @@ def test_artifact_dataset_load_with_run_id(tmp_path, tracking_uri, df1, df2): # define the logger mlflow_csv_dataset = MlflowArtifactDataset( - data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df.csv").as_posix()) + data_set=dict(type=CSVDataset, filepath=(tmp_path / "df.csv").as_posix()) ) # create a first run, save a first dataset @@ -240,7 +240,7 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path( # save first and retrieve run id mlflow_csv_dataset1 = MlflowArtifactDataset( - data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()), + data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()), artifact_path=artifact_path, ) with mlflow.start_run(): @@ -251,7 +251,7 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path( ).unlink() # we need to delete the data, else it is automatically reused instead of downloading # same as before, but a closed run_id is specified mlflow_csv_dataset2 = MlflowArtifactDataset( - data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()), + data_set=dict(type=CSVDataset, filepath=(tmp_path / "df1.csv").as_posix()), artifact_path=artifact_path, run_id=first_run_id, ) @@ -274,7 +274,7 @@ def test_partitioned_dataset_save_and_reload( data_set=dict( type=PartitionedDataSet, path=(tmp_path / "df_dir").as_posix(), - dataset="pandas.CSVDataSet", + dataset="pandas.CSVDataset", filename_suffix=".csv", ), ) diff --git 
a/tests/io/models/test_mlflow_model_logger_dataset.py b/tests/io/models/test_mlflow_model_logger_dataset.py index 05da8bf0..c5f5400f 100644 --- a/tests/io/models/test_mlflow_model_logger_dataset.py +++ b/tests/io/models/test_mlflow_model_logger_dataset.py @@ -3,10 +3,10 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pickle import PickleDataSet from kedro.io import DataCatalog, MemoryDataSet from kedro.io.core import DataSetError from kedro.pipeline import Pipeline, node +from kedro_datasets.pickle import PickleDataSet from mlflow.tracking import MlflowClient from sklearn.linear_model import LinearRegression diff --git a/tests/io/models/test_mlflow_model_saver_dataset.py b/tests/io/models/test_mlflow_model_saver_dataset.py index 4344bf2c..55bd9cdf 100644 --- a/tests/io/models/test_mlflow_model_saver_dataset.py +++ b/tests/io/models/test_mlflow_model_saver_dataset.py @@ -3,9 +3,9 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pickle import PickleDataSet from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node +from kedro_datasets.pickle import PickleDataSet from sklearn.linear_model import LinearRegression from kedro_mlflow.io.models import MlflowModelSaverDataSet diff --git a/tests/mlflow/test_kedro_pipeline_model.py b/tests/mlflow/test_kedro_pipeline_model.py index 820cdc72..1c4566a9 100644 --- a/tests/mlflow/test_kedro_pipeline_model.py +++ b/tests/mlflow/test_kedro_pipeline_model.py @@ -4,9 +4,9 @@ import mlflow import pandas as pd import pytest -from kedro.extras.datasets.pickle import PickleDataSet from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node +from kedro_datasets.pickle import PickleDataSet from sklearn.linear_model import LinearRegression from kedro_mlflow.io.models import MlflowModelSaverDataSet diff --git a/tests/pipeline/test_pipeline_ml.py b/tests/pipeline/test_pipeline_ml.py index a9aa6feb..c3c6e50e 100644 --- a/tests/pipeline/test_pipeline_ml.py +++ b/tests/pipeline/test_pipeline_ml.py @@ -1,7 +1,7 @@ import pytest -from kedro.extras.datasets.pandas import CSVDataSet from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node +from kedro_datasets.pandas import CSVDataset from kedro_mlflow.pipeline import pipeline_ml_factory from kedro_mlflow.pipeline.pipeline_ml import KedroMlflowPipelineMLError, PipelineML @@ -199,7 +199,7 @@ def dummy_catalog(): { "raw_data": MemoryDataSet(), "data": MemoryDataSet(), - "model": CSVDataSet("fake/path/to/model.csv"), + "model": CSVDataset("fake/path/to/model.csv"), } ) return dummy_catalog @@ -211,8 +211,8 @@ def catalog_with_encoder(): { "raw_data": MemoryDataSet(), "data": MemoryDataSet(), - "encoder": CSVDataSet("fake/path/to/encoder.csv"), - "model": CSVDataSet("fake/path/to/model.csv"), + "encoder": CSVDataset("fake/path/to/encoder.csv"), + "model": CSVDataset("fake/path/to/model.csv"), } ) return catalog_with_encoder @@ -224,8 +224,8 @@ def catalog_with_stopwords(): { "data": MemoryDataSet(), "cleaned_data": MemoryDataSet(), - "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"), - "model": CSVDataSet("fake/path/to/model.csv"), + "stopwords_from_nltk": CSVDataset("fake/path/to/stopwords.csv"), + "model": CSVDataset("fake/path/to/model.csv"), } ) return catalog_with_stopwords @@ -239,7 +239,7 @@ def catalog_with_parameters(): "cleaned_data": MemoryDataSet(), "params:stopwords": MemoryDataSet(["Hello", "Hi"]), "params:penalty": MemoryDataSet(0.1), - "model": 
CSVDataSet("fake/path/to/model.csv"), + "model": CSVDataset("fake/path/to/model.csv"), "params:threshold": MemoryDataSet(0.5), } )