Merge pull request #14 from teamdatatonic/feat/combine_assets
feat: combine assets into a single folder
ariadnafer authored May 30, 2023
2 parents 3238801 + caeaa74 commit d2c7c28
Showing 12 changed files with 20 additions and 29 deletions.
8 changes: 4 additions & 4 deletions Makefile
@@ -69,12 +69,12 @@ test-all-components-coverage: ## Run tests with coverage
 	$(MAKE) test-components-coverage GROUP=$$(basename $$component_group) ; \
 	done
 
-sync-assets: ## Sync assets folder to GCS. Must specify pipeline=<training|prediction>
-	@if [ -d "./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets/" ] ; then \
+sync-assets: ## Sync assets folder to GCS.
+	@if [ -d "./pipelines/assets/" ] ; then \
 	echo "Syncing assets to GCS" && \
-	gsutil -m rsync -r -d ./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets ${PIPELINE_FILES_GCS_PATH}/$(pipeline)/assets ; \
+	gsutil -m rsync -r -d ./pipelines/assets ${PIPELINE_FILES_GCS_PATH}/assets ; \
 	else \
-	echo "No assets folder found for pipeline $(pipeline)" ; \
+	echo "No assets folder found" ; \
 	fi ;
 
 run: ## Compile pipeline, copy assets to GCS, and run pipeline in sandbox environment. Must specify pipeline=<training|prediction>. Optionally specify enable_pipeline_caching=<true|false> (defaults to default Vertex caching behaviour)
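With the per-pipeline argument gone, one target call now syncs everything. A minimal sketch of the new usage, assuming `PIPELINE_FILES_GCS_PATH` is exported (the bucket path below is hypothetical):

```bash
# Previously run once per pipeline (make sync-assets pipeline=training, ...);
# now a single call syncs the combined folder.
make sync-assets

# Manual equivalent of the target's recipe; -d also deletes remote files that
# no longer exist locally, so GCS ends up mirroring ./pipelines/assets exactly.
export PIPELINE_FILES_GCS_PATH=gs://example-bucket/pipeline-files   # hypothetical
gsutil -m rsync -r -d ./pipelines/assets "${PIPELINE_FILES_GCS_PATH}/assets"
```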
16 changes: 6 additions & 10 deletions README.md
@@ -178,8 +178,8 @@ When triggering ad hoc runs in your dev/sandbox environment, or when running the
 
 ### Assets
 
-In each pipeline folder, there is an `assets` directory (`pipelines/pipelines/<xgboost|tensorflow>/<training|prediction>/assets/`).
-This can be used for any additional files that may be needed during execution of the pipelines.
+The folder `pipelines/assets/` can be used for any additional files that may be needed during execution of the pipelines.
+Most importantly this can include your training scripts.
 This directory is rsync'd to Google Cloud Storage when running a pipeline in the sandbox environment or as part of the CD pipeline (see [CI/CD setup](cloudbuild/README.md)).
 
 ## Testing
@@ -243,14 +243,10 @@ Below is a diagram of how the files are published in each environment in the `e2
 ```
 . <-- GCS directory set by _PIPELINE_PUBLISH_GCS_PATH
 └── TAG_NAME or GIT COMMIT HASH <-- Git tag used for the release (release.yaml) OR git commit hash (e2e-test.yaml)
-    ├── prediction
-    │   ├── assets
-    │   │   └── some_useful_file.json
-    │   └── prediction.json <-- compiled prediction pipeline
-    └── training
-        ├── assets
-        │   └── training_task.py
-        └── training.json <-- compiled training pipeline
+    ├── training.json
+    ├── prediction.json
+    ├── assets
+    │   └── some_useful_file.json
 ```
 
 4. `terraform-plan.yaml` - Checks the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`), and produces a summary of any proposed changes that will be applied on merge to the main branch.
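To make the new layout concrete: after a publish, the flattened structure in the diagram above could be verified with `gsutil ls`. The bucket path and tag here are hypothetical stand-ins:

```bash
# Inspect what a release publishes under its tag (hypothetical path and tag).
gsutil ls gs://example-bucket/releases/v1.2.3/
# Expected with the combined layout:
#   gs://example-bucket/releases/v1.2.3/training.json
#   gs://example-bucket/releases/v1.2.3/prediction.json
#   gs://example-bucket/releases/v1.2.3/assets/
```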
4 changes: 2 additions & 2 deletions cloudbuild/README.md
@@ -21,8 +21,8 @@ limitations under the License.
 There are five CI/CD pipelines
 
 1. `pr-checks.yaml` - runs pre-commit checks and unit tests on the custom KFP components, and checks that the ML pipelines (training and prediction) can compile.
-1. `e2e-test.yaml` - copies the "assets" folders to the chosen GCS destination (versioned by git commit hash) and runs end-to-end tests of the training and prediction pipeline.
-1. `release.yaml` - compiles training and prediction pipelines, then copies the compiled pipelines and their respective "assets" folders to the chosen GCS destination (versioned by git tag).
+1. `e2e-test.yaml` - copies the "assets" folder to the chosen GCS destination (versioned by git commit hash) and runs end-to-end tests of the training and prediction pipeline.
+1. `release.yaml` - compiles training and prediction pipelines, then copies the compiled pipelines and "assets" folder to the chosen GCS destination (versioned by git tag).
 1. `terraform-plan.yaml` - Checks the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`), and produces a summary of any proposed changes that will be applied on merge to the main branch.
 1. `terraform-apply.yaml` - Applies the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`).

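These configs normally run from Cloud Build triggers. As a rough sketch only, a manual run of the e2e config might look like the following; all substitution values are hypothetical, and `COMMIT_SHA` must be supplied by hand because it is only auto-populated for triggered builds:

```bash
gcloud builds submit . \
  --config=cloudbuild/e2e-test.yaml \
  --substitutions=COMMIT_SHA=$(git rev-parse --short HEAD),_PIPELINE_TEMPLATE=xgboost,_PIPELINE_PUBLISH_GCS_PATH=gs://example-bucket/e2e
```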
6 changes: 2 additions & 4 deletions cloudbuild/e2e-test.yaml
@@ -22,10 +22,8 @@ steps:
     args:
     - -c
     - |
-      mkdir -p ${COMMIT_SHA}/training/assets && \
-      mkdir -p ${COMMIT_SHA}/prediction/assets && \
-      cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${COMMIT_SHA}/training/ && \
-      cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${COMMIT_SHA}/prediction/ && \
+      mkdir -p ${COMMIT_SHA}/assets && \
+      cp -r pipelines/assets ${COMMIT_SHA} && \
       gsutil cp -r ${COMMIT_SHA} ${_PIPELINE_PUBLISH_GCS_PATH}/${COMMIT_SHA}
   # Install Python deps
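The simplified copy step is easy to sanity-check locally before it runs in Cloud Build; a minimal sketch, with a made-up value standing in for the `COMMIT_SHA` that Cloud Build injects:

```bash
COMMIT_SHA=abc1234   # stand-in; Cloud Build injects the real value
mkdir -p ${COMMIT_SHA}/assets
cp -r pipelines/assets ${COMMIT_SHA}
find ${COMMIT_SHA} -type d   # expect ${COMMIT_SHA} and ${COMMIT_SHA}/assets (plus any subfolders)
```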
11 changes: 4 additions & 7 deletions cloudbuild/release.yaml
@@ -21,7 +21,6 @@ steps:
     - -c
     - |
       make setup && \
-      make compile-all-components && \
       make compile-pipeline pipeline=training && \
       make compile-pipeline pipeline=prediction
     env:
@@ -35,12 +34,10 @@
     args:
     - -c
    - |
-      mkdir -p ${TAG_NAME}/training/assets && \
-      mkdir -p ${TAG_NAME}/prediction/assets && \
-      cp pipelines/training.json ${TAG_NAME}/training/training.json && \
-      cp pipelines/prediction.json ${TAG_NAME}/prediction/prediction.json && \
-      cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${TAG_NAME}/training/ && \
-      cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${TAG_NAME}/prediction/ && \
+      mkdir -p ${TAG_NAME}/assets && \
+      cp pipelines/src/training.json ${TAG_NAME}/training.json && \
+      cp pipelines/src/prediction.json ${TAG_NAME}/prediction.json && \
+      cp -r pipelines/assets/* ${TAG_NAME}/assets/ && \
       for dest in ${_PIPELINE_PUBLISH_GCS_PATHS} ; do \
         gsutil cp -r ${TAG_NAME} $$dest ; \
       done
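Note that `make compile-all-components` is dropped from this build, leaving only the two pipeline compilations. Those can be reproduced locally; the output locations below are inferred from the `cp pipelines/src/*.json` sources in the step above:

```bash
make compile-pipeline pipeline=training
make compile-pipeline pipeline=prediction
ls pipelines/src/*.json   # expect training.json and prediction.json
```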
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion pipelines/src/pipelines/tensorflow/training/pipeline.py
@@ -80,7 +80,7 @@ def tensorflow_pipeline(
     valid_table = "valid_data" + table_suffix
     test_table = "test_data" + table_suffix
     primary_metric = "rootMeanSquaredError"
-    train_script_uri = f"{pipeline_files_gcs_path}/training/assets/train_tf_model.py"
+    train_script_uri = f"{pipeline_files_gcs_path}/assets/train_tf_model.py"
     hparams = dict(
         batch_size=100,
         epochs=5,
Empty file.
Empty file.
2 changes: 1 addition & 1 deletion pipelines/src/pipelines/xgboost/training/pipeline.py
@@ -78,7 +78,7 @@ def xgboost_pipeline(
     valid_table = "valid_data" + table_suffix
     test_table = "test_data" + table_suffix
     primary_metric = "rootMeanSquaredError"
-    train_script_uri = f"{pipeline_files_gcs_path}/training/assets/train_xgb_model.py"
+    train_script_uri = f"{pipeline_files_gcs_path}/assets/train_xgb_model.py"
     hparams = dict(
         n_estimators=200,
         early_stopping_rounds=10,
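Both training pipelines now expect their script under the flat `assets/` prefix. A quick way to confirm the scripts landed where the compiled pipelines will look (bucket and version segment are hypothetical):

```bash
# pipeline_files_gcs_path ends in a tag or commit SHA (see the README diagram above).
gsutil ls gs://example-bucket/pipeline-files/abc1234/assets/train_tf_model.py
gsutil ls gs://example-bucket/pipeline-files/abc1234/assets/train_xgb_model.py
```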
