diff --git a/Makefile b/Makefile
index faa0fa43..4dc752c9 100644
--- a/Makefile
+++ b/Makefile
@@ -69,12 +69,12 @@ test-all-components-coverage: ## Run tests with coverage
 		$(MAKE) test-components-coverage GROUP=$$(basename $$component_group) ; \
 	done
 
-sync-assets: ## Sync assets folder to GCS. Must specify pipeline=<training|prediction>
-	@if [ -d "./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets/" ] ; then \
+sync-assets: ## Sync assets folder to GCS.
+	@if [ -d "./pipelines/assets/" ] ; then \
 		echo "Syncing assets to GCS" && \
-		gsutil -m rsync -r -d ./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets ${PIPELINE_FILES_GCS_PATH}/$(pipeline)/assets ; \
+		gsutil -m rsync -r -d ./pipelines/assets ${PIPELINE_FILES_GCS_PATH}/assets ; \
 	else \
-		echo "No assets folder found for pipeline $(pipeline)" ; \
+		echo "No assets folder found" ; \
 	fi ;
 
 run: ## Compile pipeline, copy assets to GCS, and run pipeline in sandbox environment. Must specify pipeline=<training|prediction>. Optionally specify enable_pipeline_caching=<true|false> (defaults to default Vertex caching behaviour)
diff --git a/README.md b/README.md
index 09940d13..38615ae0 100644
--- a/README.md
+++ b/README.md
@@ -178,8 +178,8 @@ When triggering ad hoc runs in your dev/sandbox environment, or when running the
 
 ### Assets
 
-In each pipeline folder, there is an `assets` directory (`pipelines/pipelines/<PIPELINE_TEMPLATE>/<pipeline>/assets/`).
-This can be used for any additional files that may be needed during execution of the pipelines.
+The folder `pipelines/assets/` can be used for any additional files that may be needed during execution of the pipelines.
+Most importantly, this can include your training scripts.
 This directory is rsync'd to Google Cloud Storage when running a pipeline in the sandbox environment or as part of the CD pipeline (see [CI/CD setup](cloudbuild/README.md)).
 
 ## Testing
@@ -243,14 +243,10 @@ Below is a diagram of how the files are published in each environment in the `e2
 ```
 .                                  <-- GCS directory set by _PIPELINE_PUBLISH_GCS_PATH
 └── TAG_NAME or GIT COMMIT HASH    <-- Git tag used for the release (release.yaml) OR git commit hash (e2e-test.yaml)
-    ├── prediction
-    │   ├── assets
-    │   │   └── some_useful_file.json
-    │   └── prediction.json        <-- compiled prediction pipeline
-    └── training
-        ├── assets
-        │   └── training_task.py
-        └── training.json          <-- compiled training pipeline
+    ├── training.json
+    ├── prediction.json
+    └── assets
+        └── some_useful_file.json
 ```
 
 4. `terraform-plan.yaml` - Checks the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`), and produces a summary of any proposed changes that will be applied on merge to the main branch.
diff --git a/cloudbuild/README.md b/cloudbuild/README.md
index 97eb2a4a..d541b034 100644
--- a/cloudbuild/README.md
+++ b/cloudbuild/README.md
@@ -21,8 +21,8 @@ limitations under the License.
 
 There are five CI/CD pipelines:
 
 1. `pr-checks.yaml` - runs pre-commit checks and unit tests on the custom KFP components, and checks that the ML pipelines (training and prediction) can compile.
-1. `e2e-test.yaml` - copies the "assets" folders to the chosen GCS destination (versioned by git commit hash) and runs end-to-end tests of the training and prediction pipeline.
-1. `release.yaml` - compiles training and prediction pipelines, then copies the compiled pipelines and their respective "assets" folders to the chosen GCS destination (versioned by git tag).
+1. `e2e-test.yaml` - copies the "assets" folder to the chosen GCS destination (versioned by git commit hash) and runs end-to-end tests of the training and prediction pipelines.
+1. `release.yaml` - compiles the training and prediction pipelines, then copies the compiled pipelines and the "assets" folder to the chosen GCS destination (versioned by git tag).
 1. `terraform-plan.yaml` - Checks the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`), and produces a summary of any proposed changes that will be applied on merge to the main branch.
 1. `terraform-apply.yaml` - Applies the Terraform configuration under `terraform/envs/<env>` (e.g. `terraform/envs/test`).
diff --git a/cloudbuild/e2e-test.yaml b/cloudbuild/e2e-test.yaml
index 3aa9a513..c2e28095 100644
--- a/cloudbuild/e2e-test.yaml
+++ b/cloudbuild/e2e-test.yaml
@@ -22,10 +22,8 @@ steps:
     args:
       - -c
       - |
-        mkdir -p ${COMMIT_SHA}/training/assets && \
-        mkdir -p ${COMMIT_SHA}/prediction/assets && \
-        cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${COMMIT_SHA}/training/ && \
-        cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${COMMIT_SHA}/prediction/ && \
+        mkdir -p ${COMMIT_SHA}/assets && \
+        cp -r pipelines/assets ${COMMIT_SHA} && \
         gsutil cp -r ${COMMIT_SHA} ${_PIPELINE_PUBLISH_GCS_PATH}/${COMMIT_SHA}
 
 # Install Python deps
diff --git a/cloudbuild/release.yaml b/cloudbuild/release.yaml
index 63e60115..0f262a52 100644
--- a/cloudbuild/release.yaml
+++ b/cloudbuild/release.yaml
@@ -21,7 +21,6 @@ steps:
       - -c
       - |
         make setup && \
-        make compile-all-components && \
         make compile-pipeline pipeline=training && \
         make compile-pipeline pipeline=prediction
     env:
@@ -35,12 +34,10 @@ steps:
     args:
      - -c
      - |
-        mkdir -p ${TAG_NAME}/training/assets && \
-        mkdir -p ${TAG_NAME}/prediction/assets && \
-        cp pipelines/training.json ${TAG_NAME}/training/training.json && \
-        cp pipelines/prediction.json ${TAG_NAME}/prediction/prediction.json && \
-        cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${TAG_NAME}/training/ && \
-        cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${TAG_NAME}/prediction/ && \
+        mkdir -p ${TAG_NAME}/assets && \
+        cp pipelines/src/training.json ${TAG_NAME}/training.json && \
+        cp pipelines/src/prediction.json ${TAG_NAME}/prediction.json && \
+        cp -r pipelines/assets/* ${TAG_NAME}/assets/ && \
         for dest in ${_PIPELINE_PUBLISH_GCS_PATHS} ; do \
           gsutil cp -r ${TAG_NAME} $$dest ; \
         done
diff --git a/pipelines/src/pipelines/tensorflow/prediction/assets/.gitkeep b/pipelines/assets/.gitkeep
similarity index 100%
rename from pipelines/src/pipelines/tensorflow/prediction/assets/.gitkeep
rename to pipelines/assets/.gitkeep
diff --git a/pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py b/pipelines/assets/train_tf_model.py
similarity index 100%
rename from pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py
rename to pipelines/assets/train_tf_model.py
diff --git a/pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py b/pipelines/assets/train_xgb_model.py
similarity index 100%
rename from pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py
rename to pipelines/assets/train_xgb_model.py
diff --git a/pipelines/src/pipelines/tensorflow/training/pipeline.py b/pipelines/src/pipelines/tensorflow/training/pipeline.py
index f6e70525..0dfb1bb2 100644
--- a/pipelines/src/pipelines/tensorflow/training/pipeline.py
+++ b/pipelines/src/pipelines/tensorflow/training/pipeline.py
@@ -80,7 +80,7 @@ def tensorflow_pipeline(
     valid_table = "valid_data" + table_suffix
     test_table = "test_data" + table_suffix
     primary_metric = "rootMeanSquaredError"
-    train_script_uri = f"{pipeline_files_gcs_path}/training/assets/train_tf_model.py"
+    train_script_uri = f"{pipeline_files_gcs_path}/assets/train_tf_model.py"
     hparams = dict(
         batch_size=100,
         epochs=5,
diff --git a/pipelines/src/pipelines/xgboost/prediction/assets/.gitkeep b/pipelines/src/pipelines/xgboost/prediction/assets/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/pipelines/src/pipelines/xgboost/training/assets/.gitkeep b/pipelines/src/pipelines/xgboost/training/assets/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/pipelines/src/pipelines/xgboost/training/pipeline.py b/pipelines/src/pipelines/xgboost/training/pipeline.py
index 6737a627..8499ce8a 100644
--- a/pipelines/src/pipelines/xgboost/training/pipeline.py
+++ b/pipelines/src/pipelines/xgboost/training/pipeline.py
@@ -78,7 +78,7 @@ def xgboost_pipeline(
     valid_table = "valid_data" + table_suffix
     test_table = "test_data" + table_suffix
     primary_metric = "rootMeanSquaredError"
-    train_script_uri = f"{pipeline_files_gcs_path}/training/assets/train_xgb_model.py"
+    train_script_uri = f"{pipeline_files_gcs_path}/assets/train_xgb_model.py"
     hparams = dict(
         n_estimators=200,
         early_stopping_rounds=10,