udacity · dhedderich · Aug 20, 2023 · Aug 20, 2023 · Aug 20, 2023 · Aug 20, 2023
diff --git a/.github/workflows/manual.yml b/.github/workflows/manual.yml
diff --git a/Miniconda3-latest-Linux-x86_64.sh b/Miniconda3-latest-Linux-x86_64.sh
diff --git a/components/conda.yml b/components/conda.yml
@@ -1,6 +1,7 @@
 name: components
 channels:
-  - conda-forge
   - defaults
 dependencies:
-  - mlflow=1.14.1
+  - pip=20.3.3  # conda pin syntax (single "="), same as the other conda.yml files in this change
+  - pip:
+      - mlflow==1.30.1
diff --git a/components/get_data/conda.yml b/components/get_data/conda.yml
@@ -1,11 +1,11 @@
 name: download_file
 channels:
-  - conda-forge
   - defaults
 dependencies:
-  - pip=20.3.3
-  - requests=2.24.0
-  - mlflow=1.14.1
+  - python=3.10  # explicit interpreter pin; previously only implied by a linux-only py310 pip build string
+  - pip=23.0.1
   - pip:
-      - wandb==0.10.31
+      - wandb==0.15.8
+      - requests==2.24.0
+      - mlflow==1.30.1
       - git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
diff --git a/components/test_regression_model/conda.yml b/components/test_regression_model/conda.yml
@@ -1,12 +1,11 @@
 name: test_regression_model
 channels:
-  - conda-forge
   - defaults
 dependencies:
-  - pandas=1.1.4
   - pip=20.3.3
-  - mlflow=1.14.1
-  - scikit-learn=0.24.1
   - pip:
       - wandb==0.10.31
+      - pandas==1.1.4
+      - mlflow==1.30.1  # same mlflow pin as the other environments in this change
+      - scikit-learn==1.3.0  # NOTE(review): train_val_test_split pins 0.24.1; presumably this must match the version that trained the model - confirm
       - git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
diff --git a/components/train_val_test_split/conda.yml b/components/train_val_test_split/conda.yml
@@ -1,12 +1,11 @@
-name: download_file
+name: train_val_test_split  # named after this component; "download_file" is the get_data env name
 channels:
-  - conda-forge
   - defaults
 dependencies:
   - pip=20.3.3
-  - requests=2.24.0
-  - mlflow=1.14.1
-  - scikit-learn=0.24.1
   - pip:
       - wandb==0.10.31
+      - requests==2.24.0
+      - mlflow==1.30.1
+      - scikit-learn==0.24.1
       - git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
diff --git a/conda.yml b/conda.yml
@@ -1,11 +1,10 @@
 name: components
 channels:
-  - conda-forge
   - defaults
 dependencies:
-  - mlflow=1.14.1
-  - pyyaml=5.3.1
-  - hydra-core=1.0.6
   - pip=20.3.3
   - pip:
-      - wandb==0.10.31
+      - wandb==0.10.31
+      - mlflow==1.30.1  # same mlflow pin as the component environments in this change
+      - pyyaml==5.3.1
+      - hydra-core==1.0.6  # main.py calls hydra.utils.get_original_cwd()
diff --git a/config.yaml b/config.yaml
@@ -22,7 +22,7 @@ modeling:
   stratify_by: "neighbourhood_group"
   # Maximum number of features to consider for the TFIDF applied to the title of the
   # insertion (the column called "name")
-  max_tfidf_features: 5
+  max_tfidf_features: 15  # forwarded by main.py to the train_random_forest step
   # NOTE: you can put here any parameter that is accepted by the constructor of
   # RandomForestRegressor. This is a subsample, but more could be added:
   random_forest:
@@ -32,7 +32,7 @@ modeling:
     min_samples_leaf: 3
     # Here -1 means all available cores
     n_jobs: -1
-    criterion: mae
+    criterion: absolute_error  # NOTE(review): newer scikit-learn spelling of "mae"; the training env presumably needs scikit-learn >= 1.0 - confirm
     max_features: 0.5
     # DO not change the following
     oob_score: true
diff --git a/environment.yml b/environment.yml
@@ -1,17 +1,16 @@
 name: nyc_airbnb_dev
 channels:
-  - conda-forge
   - defaults
 dependencies:
-  - mlflow=1.14.1
-  - ipython=7.21.0
-  - notebook=6.2.0
-  - jupyterlab=3.0.10
-  - cookiecutter=1.7.2
-  - hydra-core=1.0.6
-  - matplotlib=3.3.4
-  - pandas=1.2.3
-  - git=2.30.2
+  - git>=2.30  # git is a conda package, not a pip distribution, so it stays outside the pip list
   - pip=20.3.3
   - pip:
       - wandb==0.10.31
+      - mlflow==1.30.1
+      - ipython==7.21.0
+      - notebook==6.2.0
+      - jupyterlab==3.0.10
+      - cookiecutter==1.7.2
+      - hydra-core==1.0.6
+      - matplotlib==3.3.4
+      - pandas==1.2.3
diff --git a/main.py b/main.py
@@ -28,6 +28,8 @@ def go(config: DictConfig):
     os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
     os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]
 
+    root_path = hydra.utils.get_original_cwd()  # original working directory; local step paths are joined onto it
+
     # Steps to execute
     steps_par = config['main']['steps']
     active_steps = steps_par.split(",") if steps_par != "all" else _steps
@@ -38,9 +40,8 @@ def go(config: DictConfig):
         if "download" in active_steps:
             # Download file and load in W&B
             _ = mlflow.run(
-                f"{config['main']['components_repository']}/get_data",
+                os.path.join(root_path, "components", "get_data"),  # local component; segments joined like the other steps
                 "main",
-                version='main',
                 parameters={
                     "sample": config["etl"]["sample"],
                     "artifact_name": "sample.csv",
@@ -50,22 +51,43 @@ def go(config: DictConfig):
             )
 
         if "basic_cleaning" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(root_path, "src", "basic_cleaning"),
+                "main",
+                parameters={
+                    "input_artifact": "sample.csv:latest",  # same name the download step passes as artifact_name
+                    "output_artifact": "clean_sample.csv",
+                    "output_type": "clean_sample",
+                    "output_description": "Data with outliers and null values removed",
+                    "min_price": config["etl"]["min_price"],  # price bounds come from the etl config section
+                    "max_price": config["etl"]["max_price"]
+                },
+            )
 
         if "data_check" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(root_path, "src", "data_check"),
+                "main",
+                parameters={
+                    "csv": "clean_sample.csv:latest",  # output_artifact name of the basic_cleaning step
+                    "ref": "clean_sample.csv:reference",  # NOTE(review): presumably needs a version tagged "reference" - confirm
+                    "kl_threshold": config["data_check"]["kl_threshold"],
+                    "min_price": config["etl"]["min_price"],  # same bounds as passed to basic_cleaning
+                    "max_price": config["etl"]["max_price"]
+                },
+            )
 
         if "data_split" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(root_path, "components", "train_val_test_split"),
+                "main",
+                parameters={
+                    "input": "clean_sample.csv:latest",  # output_artifact name of the basic_cleaning step
+                    "test_size": config["modeling"]["test_size"],
+                    "random_seed": config["modeling"]["random_seed"],
+                    "stratify_by": config["modeling"]["stratify_by"]
+                },
+            )
 
         if "train_random_forest" in active_steps:
 
@@ -77,19 +99,29 @@ def go(config: DictConfig):
             # NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
             # step
 
-            ##################
-            # Implement here #
-            ##################
-
-            pass
-
+            _ = mlflow.run(
+                os.path.join(root_path, "src", "train_random_forest"),
+                "main",
+                parameters={
+                    "trainval_artifact": "trainval_data.csv:latest",
+                    "val_size": config["modeling"]["test_size"],  # NOTE(review): fed from test_size; confirm a separate modeling.val_size was not intended
+                    "random_seed": config["modeling"]["random_seed"],
+                    "stratify_by": config["modeling"]["stratify_by"],
+                    "rf_config": rf_config,
+                    "max_tfidf_features": config["modeling"]["max_tfidf_features"],
+                    "output_artifact": "random_forest_export",
+                },
+            )
         if "test_regression_model" in active_steps:
 
-            ##################
-            # Implement here #
-            ##################
-
-            pass
+            _ = mlflow.run(
+                os.path.join(root_path, "components", "test_regression_model"),
+                "main",
+                parameters={
+                    "mlflow_model": "random_forest_export:prod",  # NOTE(review): presumably needs a version tagged "prod" - confirm
+                    "test_dataset": "test_data.csv:latest",
+                },
+            )
 
 
 if __name__ == "__main__":

diff --git a/oryx-build-commands.txt b/oryx-build-commands.txt
@@ -0,0 +1,2 @@
+PlatformWithVersion=Python 
+BuildCommands=conda env create --file environment.yml --prefix ./venv --quiet
diff --git a/src/basic_cleaning/MLproject b/src/basic_cleaning/MLproject
@@ -0,0 +1,41 @@
+# MLflow project for the basic_cleaning step; main.py runs its "main" entry point.
+name: basic_cleaning
+conda_env: conda.yml
+
+entry_points:
+  main:
+    parameters:
+
+      input_artifact:
+        description: Input CSV artifact from W&B, including its version or alias
+        type: string
+
+      output_artifact:
+        description: Name of the cleaned output artifact
+        type: string
+
+      output_type:
+        description: Type of the output artifact
+        type: string
+
+      output_description:
+        description: Description of the output artifact
+        type: string
+
+      min_price:
+        description: Minimum price to filter for
+        type: float
+
+      max_price:
+        description: Maximum price to filter for
+        type: float
+
+    # Folded scalar: the lines below are joined with single spaces into one command.
+    command: >-
+      python run.py
+      --input_artifact {input_artifact}
+      --output_artifact {output_artifact}
+      --output_type {output_type}
+      --output_description {output_description}
+      --min_price {min_price}
+      --max_price {max_price}