Merge pull request #10 from teamdatatonic/feature/simplify-component-imports

Simplify component imports
Jonny Browning (Datatonic) authored May 17, 2023
2 parents 484ea38 + bd31667 commit 172d1c8
Showing 80 changed files with 2,833 additions and 2,149 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -170,5 +170,5 @@ env.sh
# VS code configurations
.vscode

# Compiled pipeline components
**/component.yaml
# Compiled pipelines
**/*pipeline.json
2 changes: 0 additions & 2 deletions .pre-commit-config.yaml
@@ -21,7 +21,6 @@ repos:
rev: v0.0.9
hooks:
- id: terraform-fmt
# - id: shellcheck

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
@@ -43,7 +42,6 @@ repos:
hooks:
- id: yamllint
args: [--format, parsable, --strict]
exclude: .*component.yaml

- repo: https://github.com/psf/black
rev: 22.3.0
32 changes: 3 additions & 29 deletions .yamllint
@@ -14,34 +14,8 @@

---

yaml-files:
- '*.yaml'
- '*.yml'
- .yamllint
extends: default

rules:
braces: enable
brackets: enable
colons: enable
commas: enable
comments:
level: warning
comments-indentation:
level: warning
document-end: disable
document-start:
level: warning
empty-lines: enable
empty-values: disable
hyphens: enable
indentation: enable
key-duplicates: enable
key-ordering: disable
line-length: disable
new-line-at-end-of-file: enable
new-lines: enable
octal-values: disable
quoted-strings: disable
trailing-spaces: enable
truthy:
level: warning
line-length:
max: 120
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
@@ -52,7 +52,6 @@ When we test a function that makes an external API call (for example to a servic

We do mocking/patching in two different ways:
1. [`monkeypatch`](https://docs.pytest.org/en/6.2.x/monkeypatch.html): this built-in `pytest` fixture allows you to modify an attribute (such as an instance method in a class). We use `monkeypatch` only in the relevant `conftest.py` file for the fixtures that are applied before every unit test. We have used `monkeypatch` in the following fixtures:
- `patch_kfp_component_decorator`: used to patch the decorator `@component` in `kfp.v2.dsl`. This allows us to test each KFP component as an untouched Python function. Importantly, when testing KFP components, you must import the component inside the unit test function, otherwise the monkeypatch of the `@component` decorator will fail.
- `mock_kfp_artifact`: used to mock the `Artifact` object (and thus any derived classes such as `Dataset`, `Model`, etc.) in `kfp.v2.dsl` to return the URI as
the path. This lets us create mock Artifact objects locally for our unit tests.
2. `unittest.mock.patch`: this object in the `unittest` library enables us to mock classes (and its associated attributes and methods) within a context manager. We use `mock.patch` inside the individual test scripts to mock object(s) that are used in the function being tested. This allows us to replace the target class/object with a Mock object, ultimately allowing us to make assertions on how this Mock object has been used. For example, the `assert_called_once_with` method allows us to check that a specific method of a Mock object was called once with specific arguments. Alternatively, we can set the attributes of our Mock objects to specific values, and assert that the component logic being tested handles these cases correctly (e.g. by raising a `ValueError`). An example of using the `mock.patch` context manager is in [`test_lookup_model.py`](tests/kfp_components/aiplatform/test_lookup_model.py) for [`lookup_model.py`](./pipeline_components/aiplatform/aiplatform/lookup_model/component.py), where there is an API call to list models (in Vertex AI) based on a filter, namely `google.cloud.aiplatform.Model.list`. When we test this KFP component, we are not interested in actually making this API call, so instead we mock it. We do this by mocking the `google.cloud.aiplatform.Model` class:
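For illustration, a minimal, self-contained sketch of this pattern might look as follows (the `lookup_model` function here is a simplified stand-in for the real component, not the repository's actual code):

```python
from unittest import mock


def lookup_model(model_name: str, project: str, location: str):
    """Simplified stand-in for the component logic: list matching models and return the first."""
    from google.cloud import aiplatform

    models = aiplatform.Model.list(
        filter=f'display_name="{model_name}"', project=project, location=location
    )
    if not models:
        raise RuntimeError(f"No model found with display name {model_name}")
    return models[0]


def test_lookup_model_calls_list_once():
    # Patch the Vertex AI Model class so no real API call is made
    with mock.patch("google.cloud.aiplatform.Model") as mock_model:
        mock_model.list.return_value = [
            mock.Mock(resource_name="projects/p/locations/l/models/1")
        ]

        model = lookup_model("my-model", "my-project", "europe-west4")

        # Assert the API was queried exactly once with the expected arguments
        mock_model.list.assert_called_once_with(
            filter='display_name="my-model"', project="my-project", location="europe-west4"
        )
        assert model.resource_name == "projects/p/locations/l/models/1"
```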
@@ -107,7 +106,7 @@ make test-all-components

Or to just run the unit tests for a given component group (e.g. `aiplatform`):
```
make test-components GROUP=aiplatform
make test-components GROUP=vertex-components
```

### End-to-end (E2E) pipeline tests
52 changes: 27 additions & 25 deletions Makefile
@@ -29,55 +29,57 @@ setup: ## Set up local environment for Python development on pipelines

test-trigger: ## Runs unit tests for the pipeline trigger code
@cd pipelines && \
pipenv install --dev && \
pipenv run python -m pytest tests/trigger

compile-pipeline: ## Compile the pipeline to training.json or prediction.json. Must specify pipeline=<training|prediction>
@cd pipelines && \
@cd pipelines/src && \
pipenv run python -m pipelines.${PIPELINE_TEMPLATE}.${pipeline}.pipeline

compile-components: ## Compile all the components in a component group
@cd pipeline_components/${GROUP} && \
pipenv install && \
for component in ${GROUP}/*/component.py ; do \
pipenv run python $$component ; \
done
setup-components: ## Install Python dependencies for a component group
@cd "components/${GROUP}" && \
pipenv install --dev

compile-all-components: ## Compile all pipeline components
setup-all-components: ## Install Python dependencies for all pipeline components
@set -e && \
for component_group in pipeline_components/*/ ; do \
echo "Compiling components under $$component_group" && \
$(MAKE) compile-components GROUP=$$(basename $$component_group) ; \
for component_group in components/*/ ; do \
echo "Setup components under $$component_group" && \
$(MAKE) setup-components GROUP=$$(basename $$component_group) ; \
done

test-components: ## Run unit tests for a component group
@cd pipeline_components/${GROUP} && \
pipenv install --dev && \
@cd "components/${GROUP}" && \
pipenv run pytest

test-all-components: ## Run unit tests for all pipeline components
@set -e && \
for component_group in pipeline_components/*/ ; do \
echo "Running unit tests for components under $$component_group" && \
for component_group in components/*/ ; do \
echo "Test components under $$component_group" && \
$(MAKE) test-components GROUP=$$(basename $$component_group) ; \
done

sync-assets: ## Sync assets folder to GCS. Must specify pipeline=<training|prediction>
if [ -d "./pipelines/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets/" ] ; then \
gsutil -m rsync -r -d ./pipelines/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets ${PIPELINE_FILES_GCS_PATH}/$(pipeline)/assets ; \
@if [ -d "./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets/" ] ; then \
echo "Syncing assets to GCS" && \
gsutil -m rsync -r -d ./pipelines/src/pipelines/${PIPELINE_TEMPLATE}/$(pipeline)/assets ${PIPELINE_FILES_GCS_PATH}/$(pipeline)/assets ; \
else \
echo "No assets folder found for pipeline $(pipeline)" ; \
fi ;

run: ## Compile pipeline, copy assets to GCS, and run pipeline in sandbox environment. Must specify pipeline=<training|prediction>. Optionally specify enable_pipeline_caching=<true|false> (defaults to default Vertex caching behaviour)
@ $(MAKE) compile-pipeline && \
$(MAKE) sync-assets && \
cd pipelines/src && \
pipenv run python -m pipelines.trigger --template_path=./$(pipeline).json --enable_caching=$(enable_pipeline_caching)

sync_assets ?= true
e2e-tests: ## (Optionally) copy assets to GCS, and perform end-to-end (E2E) pipeline tests. Must specify pipeline=<training|prediction>. Optionally specify enable_pipeline_caching=<true|false> (defaults to default Vertex caching behaviour). Optionally specify sync_assets=<true|false> (defaults to true)
@if [ $$sync_assets = true ] ; then \
$(MAKE) sync-assets; \
else \
echo "Skipping syncing assets to GCS"; \
fi && \
cd pipelines && \
pipenv run python -m trigger.main --template_path=./$(pipeline).json --enable_caching=$(enable_pipeline_caching)

e2e-tests: ## Compile pipeline, copy assets to GCS, and perform end-to-end (E2E) pipeline tests. Must specify pipeline=<training|prediction>. Optionally specify enable_pipeline_caching=<true|false> (defaults to default Vertex caching behaviour)
@ $(MAKE) compile-pipeline && \
$(MAKE) sync-assets && \
cd pipelines && \
pipenv run python -m pytest --log-cli-level=INFO tests/${PIPELINE_TEMPLATE}/$(pipeline) --enable_caching=$(enable_pipeline_caching)
pipenv run pytest --log-cli-level=INFO tests/${PIPELINE_TEMPLATE}/$(pipeline) --enable_caching=$(enable_pipeline_caching)

env ?= dev
deploy-infra: ## Deploy the Terraform infrastructure to your project. Requires VERTEX_PROJECT_ID and VERTEX_LOCATION env variables to be set in env.sh. Optionally specify env=<dev|test|prod> (default = dev)
36 changes: 14 additions & 22 deletions README.md
@@ -152,18 +152,6 @@ bq mk --transfer_config \

#### Running Pipelines

Before you run the pipeline(s), you will need to compile the pipeline components to their YAML format with:

```bash
make compile-all-components
```

Whenever you make changes to the pipeline components, you will need to re-compile the relevant components with:

```bash
make compile-components GROUP=<component group e.g. aiplatform>
```

You can run the XGBoost training pipeline (for example) with:

```bash
@@ -190,30 +178,34 @@ When triggering ad hoc runs in your dev/sandbox environment, or when running the

### Assets

In each pipeline folder, there is an `assets` directory (`pipelines/pipelines/<xgboost|tensorflow>/<training|prediction>/assets/`). This can be used for any additional files that may be needed during execution of the pipelines.
In each pipeline folder, there is an `assets` directory (`pipelines/pipelines/<xgboost|tensorflow>/<training|prediction>/assets/`).
This can be used for any additional files that may be needed during execution of the pipelines.
This directory is rsync'd to Google Cloud Storage when running a pipeline in the sandbox environment or as part of the CD pipeline (see [CI/CD setup](cloudbuild/README.md)).

## Testing

Unit tests and end-to-end (E2E) pipeline tests are performed using [pytest](https://docs.pytest.org). The unit tests for custom KFP components are run on each pull request, and the E2E tests are run on merge to the main branch. To run them on your local machine:
Unit tests and end-to-end (E2E) pipeline tests are performed using [pytest](https://docs.pytest.org).
The unit tests for custom KFP components are run on each pull request, and the E2E tests are run on merge to the main branch. To run them on your local machine:

```
make test-components GROUP=<component group e.g. aiplatform>
make setup-all-components
make test-all-components
```

or

Alternatively, set up and test just one of the component groups by running:
```
make test-all-components
make setup-components GROUP=vertex-components
make test-components GROUP=vertex-components
```

and
To run end-to-end tests of a single pipeline, you can use:

```
make e2e-tests pipeline=<training|prediction> [ enable_caching=<true|false> ]
make e2e-tests pipeline=<training|prediction> [ enable_caching=<true|false> ] [ sync_assets=<true|false> ]
```

There are also unit tests for the pipeline triggering code [`pipelines/pipelines/trigger`](../pipelines/trigger). This is not run as part of a CI/CD pipeline, as we don't expect this to be changed for each use case. To run them on your local machine:
There are also unit tests for the pipeline triggering code.
This is not run as part of a CI/CD pipeline, as we don't expect this to be changed for each use case. To run them on your local machine:

```
make test-trigger
@@ -223,7 +215,7 @@

### Update existing pipelines

See existing [XGBoost](pipelines/pipelines/xgboost) and [Tensorflow](pipelines/pipelines/tensorflow) pipelines as part of this template.
See existing [XGBoost](pipelines/src/pipelines/xgboost) and [Tensorflow](pipelines/src/pipelines/tensorflow) pipelines as part of this template.
Update `PIPELINE_TEMPLATE` to `xgboost` or `tensorflow` in [env.sh](env.sh.example) to specify whether to run the XGBoost pipelines or TensorFlow pipelines.
Make changes to the ML pipelines and their associated tests.
Refer to the [contribution instructions](CONTRIBUTING.md) for more information on committing changes.
17 changes: 5 additions & 12 deletions cloudbuild/e2e-test.yaml
@@ -24,27 +24,20 @@ steps:
- |
mkdir -p ${COMMIT_SHA}/training/assets && \
mkdir -p ${COMMIT_SHA}/prediction/assets && \
cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${COMMIT_SHA}/training/ && \
cp -r pipelines/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${COMMIT_SHA}/prediction/ && \
cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/training/assets ${COMMIT_SHA}/training/ && \
cp -r pipelines/src/pipelines/${_PIPELINE_TEMPLATE}/prediction/assets ${COMMIT_SHA}/prediction/ && \
gsutil cp -r ${COMMIT_SHA} ${_PIPELINE_PUBLISH_GCS_PATH}/${COMMIT_SHA}
# Install Python deps
# Run end-to-end (E2E) pipeline tests on both pipelines
- name: python:3.7
entrypoint: /bin/sh
dir: pipelines
args:
- -c
- |
pip install pipenv && \
pipenv install --dev && \
cd ../ && \
make compile-all-components && \
make compile-pipeline pipeline=training && \
make compile-pipeline pipeline=prediction && \
cd pipelines && \
pipenv run python -m pytest --log-cli-level=INFO tests/${_PIPELINE_TEMPLATE}/training --enable_caching=${_TEST_ENABLE_PIPELINE_CACHING} && \
pipenv run python -m pytest --log-cli-level=INFO tests/${_PIPELINE_TEMPLATE}/prediction --enable_caching=${_TEST_ENABLE_PIPELINE_CACHING}
make setup && \
make e2e-tests pipeline=training enable_pipeline_caching=False sync_assets=false && \
make e2e-tests pipeline=prediction enable_pipeline_caching=False sync_assets=false
env:
- VERTEX_LOCATION=${_TEST_VERTEX_LOCATION}
- VERTEX_PROJECT_ID=${_TEST_VERTEX_PROJECT_ID}
6 changes: 3 additions & 3 deletions cloudbuild/pr-checks.yaml
@@ -26,10 +26,10 @@ steps:
git init && \
git add . && \
make pre-commit && \
make compile-all-components && \
make test-all-components && \
make compile-pipeline pipeline=training && \
make compile-pipeline pipeline=prediction
make compile-pipeline pipeline=prediction && \
make setup-all-components && \
make test-all-components
env:
- SKIP=terraform-fmt,git-dirty
- PIPELINE_TEMPLATE=${_PIPELINE_TEMPLATE}
2 changes: 1 addition & 1 deletion cloudbuild/trigger-tests.yaml
@@ -17,7 +17,7 @@ steps:
args:
- '-c'
- |
pip install pipenv && \
make setup && \
make test-trigger
entrypoint: /bin/sh

28 changes: 28 additions & 0 deletions components/README.md
@@ -0,0 +1,28 @@
# Kubeflow Pipelines Components

This directory contains multiple Python packages that are used to define pipeline components with the Kubeflow Pipelines SDK. Each subdirectory is a package with its own set of Python dependencies. Components with the same Python dependencies can reside in the same package, but components with different Python dependencies should be split into different Python packages.

## Creating a new pipeline components package

To create a new set of components (with different Python dependencies), copy one of the existing subdirectories and rename the different files and directories as appropriate (e.g. `bigquery-components` -> `my-new-components`). You will also need to update any references in the Python files themselves, as well as the `Pipfile` and `pyproject.toml`.

Your Python dependencies should be defined in `Pipfile`, `pyproject.toml`, and in `packages_to_install` (in the `@component` decorator):

- In `Pipfile`, add `kfp` to the `[packages]` section (pinned to a specific version), and add any dependencies that your component uses under `[dev-packages]` (each pinned to a specific version)
- In `pyproject.toml`, add `kfp` to the `[dependencies]` section (pinned to a specific version), and add any dependencies that your component uses under `[project.optional-dependencies]` -> `tests` (each pinned to a specific version)
- In `packages_to_install` (in the `@component` decorator used to define your component), add any dependencies that your component uses (each pinned to a specific version)

Define your pipeline components using the `@component` decorator in Python files under `my-new-components/src/my-new-components`. You will also need to update the `__init__.py` file so that your new components can be imported from the package - see the [Kubeflow Pipelines documentation](https://www.kubeflow.org/docs/components/pipelines/v1/sdk-v2/python-function-components/#building-python-function-based-components) for more information about writing pipeline components.
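For illustration, a minimal component definition might look like this (a sketch only - the component name, base image, and dependency pins below are placeholders, not taken from this repository):

```python
from kfp.v2.dsl import Dataset, Output, component


@component(
    base_image="python:3.7",
    packages_to_install=["google-cloud-bigquery==2.30.0"],
)
def extract_table(
    project_id: str,
    table_id: str,
    dataset: Output[Dataset],
):
    """Toy component: export a BigQuery table to the output Dataset artifact as CSV."""
    import csv

    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)
    rows = client.list_rows(table_id)
    with open(dataset.path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([field.name for field in rows.schema])
        writer.writerows([list(row.values()) for row in rows])
```

Note how the version pinned in `packages_to_install` should mirror the versions declared in the package's `Pipfile` and `pyproject.toml`.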

Finally, you will need to install this new components package into the [`pipelines`](../pipelines) package. In [`pipelines/Pipfile`](../pipelines/Pipfile), add the following line to the `packages` section:
```ini
my-new-components = {editable = true, path = "./../components/my-new-components"}
```

Once you have added this line to [`pipelines/Pipfile`](../pipelines/Pipfile), run `make setup` from the root of the repository to install the new components package into the `pipelines` package.

## Testing components

Unit tests for components are defined using pytest and should be created under `my-new-components/tests`. Take a look at the existing components to see examples of how you can write these tests and perform mocking/patching of KFP Artifact types.

To run the unit tests, you will first need to set up the virtual environment for the new components by running `make setup-components GROUP=my-new-components` from the root of the repository. Once you have done this, you can run the unit tests using `make test-components GROUP=my-new-components`.
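As a rough sketch of the idea (illustrative names only, not the repository's actual test code), a test can substitute a simple mock object for a KFP `Dataset` artifact, since the component body typically only relies on its `.path` attribute:

```python
from pathlib import Path
from unittest import mock


def summarise_dataset(dataset) -> int:
    """Stand-in for a component body: count the data rows of a CSV Dataset artifact."""
    with open(dataset.path) as f:
        return sum(1 for _ in f) - 1  # exclude the header row


def test_summarise_dataset(tmp_path: Path):
    # Fake KFP Dataset artifact: only the .path attribute is needed by the code under test
    fake_dataset = mock.Mock()
    fake_dataset.path = str(tmp_path / "data.csv")
    Path(fake_dataset.path).write_text("a,b\n1,2\n3,4\n")

    assert summarise_dataset(fake_dataset) == 2
```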
@@ -4,11 +4,11 @@ verify_ssl = true
name = "pypi"

[packages]
kfp = "==1.8.9"
kfp = "==1.8.21"

[dev-packages]
pytest = ">=6.2.4,<7.0.0"
google-cloud-bigquery = "==2.30.0"
pytest = ">=7.3.1,<8.0.0"

[requires]
python_version = "3.7"