From d11fae78d8a49e9945760d10bbffc5427c3741d4 Mon Sep 17 00:00:00 2001
From: "Yuan (Bob) Gong"
Date: Wed, 21 Aug 2019 08:25:20 +0800
Subject: [PATCH] Use KFP lite deployment for presubmit tests (#1808)

* Refactor presubmit-tests-with-pipeline-deployment.sh so that it can be run from a different project
* Simplify getting the service account from the cluster
* Migrate presubmit-tests-with-pipeline-deployment.sh to use KFP lightweight deployment
* Add an option to cache built images to make debugging faster
* Fix cluster setup
* Copy the image-builder image instead of granting permission
* Add missing yes command
* Fix miscellaneous issues
* Make other usages of the image-builder image configurable
* Let the test workflow use the image-builder image
* Fix permission issue
* Hide irrelevant error logs
* Use a shared service account key instead
* Move test manifest to the test folder
* Move image building into a separate build-images.sh script
* Update README.md
* Add cluster info dump
* Use the same cluster resources as the Kubeflow deployment
* Remove cluster info dump
* Add timing to the test log
* Clean up code
* Fix tests
* Address CR comments
* Address CR comments (second round)
* Enable image caching to improve retest speed
---
 test/.gitignore                               |  2 +
 test/README.md                                |  5 ++
 test/build-images.sh                          | 59 +++++++++++++++
 test/check-argo-status.sh                     |  5 +-
 test/deploy-cluster.sh                        | 73 +++++++++++++++++++
 test/deploy-pipeline-lite.sh                  | 47 ++++++++++++
 test/install-argo.sh                          | 27 ++++---
 test/manifests/kustomization.yaml             |  7 ++
 ...resubmit-tests-with-pipeline-deployment.sh | 49 ++++---------
 9 files changed, 226 insertions(+), 48 deletions(-)
 create mode 100644 test/.gitignore
 create mode 100755 test/build-images.sh
 create mode 100755 test/deploy-cluster.sh
 create mode 100755 test/deploy-pipeline-lite.sh
 create mode 100644 test/manifests/kustomization.yaml

diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 00000000000..fb86d0d665c
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+# temporary folder used in tests
+bin
diff --git a/test/README.md b/test/README.md
index 8add7f2eb35..8b605edee66 100644
--- a/test/README.md
+++ b/test/README.md
@@ -72,6 +72,11 @@ Run the following commands from root of kubeflow/pipelines repo.
 #$PULL_PULL_SHA and $WORKSPACE are env variables set by Prow
 export PULL_PULL_SHA=pull-sha-placeholder
 export WORKSPACE=$(pwd) # root of kubeflow/pipelines git repo
+export SA_KEY_FILE=PATH/TO/YOUR/GCP/PROJECT/SERVICE/ACCOUNT/KEY
+# (optional) uncomment the following to keep reusing the same cluster
+# export TEST_CLUSTER=YOUR_PRECONFIGURED_CLUSTER_NAME
+# (optional) uncomment the following to disable built image caching
+# export DISABLE_IMAGE_CACHING=true
 ./test/presubmit-tests-with-pipeline-deployment.sh \
   --workflow_file e2e_test_gke_v2.yaml \
   # You can specify other workflows you want to test too.
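
Note: based on the README snippet above, a local run might look like the following sketch. The key path and cluster name are placeholders, and only flags that appear in this patch are shown; presubmit-tests-with-pipeline-deployment.sh may accept more.

    # Assumes gcloud is authenticated against a GCP project where you can create clusters.
    export PULL_PULL_SHA=$(git rev-parse HEAD)   # Prow sets this in CI; locally any unique id works
    export WORKSPACE=$(pwd)                      # root of kubeflow/pipelines
    export SA_KEY_FILE=$HOME/keys/test-sa.json   # placeholder path
    export TEST_CLUSTER=my-debug-cluster         # optional: reuse this cluster across runs
    export DISABLE_IMAGE_CACHING=true            # optional: force image rebuilds
    ./test/presubmit-tests-with-pipeline-deployment.sh \
      --workflow_file e2e_test_gke_v2.yaml
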
diff --git a/test/build-images.sh b/test/build-images.sh
new file mode 100755
index 00000000000..472c219948e
--- /dev/null
+++ b/test/build-images.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+IMAGE_BUILDER_ARG=""
+if [ "$PROJECT" != "ml-pipeline-test" ]; then
+  COPIED_IMAGE_BUILDER_IMAGE=${GCR_IMAGE_BASE_DIR}/image-builder
+  echo "Copy image builder image to ${COPIED_IMAGE_BUILDER_IMAGE}"
+  yes | gcloud container images add-tag \
+    gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4 \
+    ${COPIED_IMAGE_BUILDER_IMAGE}:latest
+  IMAGE_BUILDER_ARG="-p image-builder-image=${COPIED_IMAGE_BUILDER_IMAGE}"
+fi
+
+# Image caching can be turned off by setting the $DISABLE_IMAGE_CACHING env flag.
+# Note that GCR_IMAGE_BASE_DIR contains the commit hash, so a code change always
+# produces a new image path and stale caches are never reused.
+BUILT_IMAGES=$(gcloud container images list --repository=${GCR_IMAGE_BASE_DIR})
+if
+  test -z "$DISABLE_IMAGE_CACHING" && \
+  echo "$BUILT_IMAGES" | grep api-server && \
+  echo "$BUILT_IMAGES" | grep frontend && \
+  echo "$BUILT_IMAGES" | grep scheduledworkflow && \
+  echo "$BUILT_IMAGES" | grep persistenceagent;
+then
+  echo "docker images for api-server, frontend, scheduledworkflow and \
+    persistenceagent are already built in ${GCR_IMAGE_BASE_DIR}."
+else
+  echo "submitting argo workflow to build docker images for commit ${PULL_PULL_SHA}..."
+  # Build images
+  ARGO_WORKFLOW=`argo submit ${DIR}/build_image.yaml \
+    -p image-build-context-gcs-uri="$remote_code_archive_uri" \
+    ${IMAGE_BUILDER_ARG} \
+    -p api-image="${GCR_IMAGE_BASE_DIR}/api-server" \
+    -p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \
+    -p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \
+    -p persistenceagent-image="${GCR_IMAGE_BASE_DIR}/persistenceagent" \
+    -n ${NAMESPACE} \
+    --serviceaccount test-runner \
+    -o name
+  `
+  echo "build docker images workflow submitted successfully"
+  source "${DIR}/check-argo-status.sh"
+  echo "build docker images workflow completed"
+fi
diff --git a/test/check-argo-status.sh b/test/check-argo-status.sh
index 64f99841825..80c4d4803cb 100755
--- a/test/check-argo-status.sh
+++ b/test/check-argo-status.sh
@@ -25,7 +25,8 @@ echo "check status of argo workflow $ARGO_WORKFLOW...."
 # probing the argo workflow status until it completed. Timeout after 30 minutes
 for i in $(seq 1 ${PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT})
 do
-  WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW -n ${NAMESPACE} --show-labels`
+  WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW -n ${NAMESPACE} --show-labels 2>&1` \
+    || echo kubectl get workflow failed with "$WORKFLOW_STATUS" # Tolerate temporary network failure during kubectl get workflow
   echo $WORKFLOW_STATUS | grep ${WORKFLOW_COMPLETE_KEYWORD} && s=0 && break || s=$? && printf "Workflow ${ARGO_WORKFLOW} is not finished.\n${WORKFLOW_STATUS}\nSleep for 20 seconds...\n" && sleep 20
 done
 
@@ -54,4 +55,4 @@ if [[ $WORKFLOW_STATUS = *"${WORKFLOW_FAILED_KEYWORD}"* ]]; then
   exit 1
 else
   argo get ${ARGO_WORKFLOW} -n ${NAMESPACE}
-fi
\ No newline at end of file
+fi
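
Note: the retry logic added above boils down to this polling pattern, sketched standalone with hypothetical variable names (the real keyword and attempt count come from check-argo-status.sh; "completed=true" is an assumed default).

    # Poll until the workflow carries its completion label; tolerate transient
    # kubectl/network failures and give up after MAX_ATTEMPT * 20 seconds.
    for i in $(seq 1 "${MAX_ATTEMPT:-90}"); do
      STATUS=$(kubectl get workflow "$WORKFLOW" -n "$NS" --show-labels 2>&1) \
        || echo "kubectl get workflow failed with: $STATUS"
      echo "$STATUS" | grep -q "${COMPLETE_KEYWORD:-completed=true}" && break
      sleep 20
    done
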
diff --git a/test/deploy-cluster.sh b/test/deploy-cluster.sh
new file mode 100755
index 00000000000..10f6e0e5d2a
--- /dev/null
+++ b/test/deploy-cluster.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+# Specify the TEST_CLUSTER env variable to use an existing cluster.
+TEST_CLUSTER_PREFIX=${WORKFLOW_FILE%.*}
+TEST_CLUSTER_DEFAULT=$(echo $TEST_CLUSTER_PREFIX | cut -d _ -f 1)-${PULL_PULL_SHA:0:7}-${RANDOM}
+TEST_CLUSTER=${TEST_CLUSTER:-${TEST_CLUSTER_DEFAULT}}
+SHOULD_CLEANUP_CLUSTER=false
+
+function clean_up {
+  set +e # the following clean up commands shouldn't exit on error
+
+  echo "Status of pods before clean up:"
+  kubectl get pods --all-namespaces
+
+  echo "Clean up..."
+  if [ "$SHOULD_CLEANUP_CLUSTER" == true ]; then
+    # --async doesn't wait for this operation to complete, so we can get test
+    # results faster
+    yes | gcloud container clusters delete ${TEST_CLUSTER} --async
+  fi
+}
+trap clean_up EXIT SIGINT SIGTERM
+
+cd ${DIR}
+# Check whether ${TEST_CLUSTER} already exists.
+if gcloud container clusters describe ${TEST_CLUSTER} &>/dev/null; then
+  echo "Use existing test cluster: ${TEST_CLUSTER}"
+else
+  echo "Creating a new test cluster: ${TEST_CLUSTER}"
+  SHOULD_CLEANUP_CLUSTER=true
+  # "storage-rw" is needed to allow VMs to push to gcr.io
+  # reference: https://cloud.google.com/compute/docs/access/service-accounts#accesscopesiam
+  SCOPE_ARG="--scopes=storage-rw"
+  # Machine type and cluster size are the same as the Kubeflow deployment so
+  # that performance is easy to compare. We can reduce usage later.
+  NODE_POOL_CONFIG_ARG="--num-nodes=2 --machine-type=n1-standard-8 \
+    --enable-autoscaling --max-nodes=8 --min-nodes=2"
+  gcloud container clusters create ${TEST_CLUSTER} ${SCOPE_ARG} ${NODE_POOL_CONFIG_ARG}
+fi
+
+gcloud container clusters get-credentials ${TEST_CLUSTER}
+
+# When we reuse a cluster for debugging, clean up its KFP installation first.
+# This is a no-op on a freshly created cluster.
+kubectl delete namespace ${NAMESPACE} --wait || echo "No need to delete ${NAMESPACE} namespace. It doesn't exist."
+kubectl create namespace ${NAMESPACE} --dry-run -o yaml | kubectl apply -f -
+
+if [ -z "$SA_KEY_FILE" ]; then
+  SA_KEY_FILE=${DIR}/key.json
+  # The service account key is for the default VM service account.
+  # ref: https://cloud.google.com/compute/docs/access/service-accounts#compute_engine_default_service_account
+  # It was generated by the following command:
+  # `gcloud iam service-accounts keys create $SA_KEY_FILE --iam-account ${VM_SERVICE_ACCOUNT}`
+  # Because there's a limit of 10 keys per service account, we reuse the same key stored in the following bucket.
+  gsutil cp "gs://ml-pipeline-test-keys/ml-pipeline-test-sa-key.json" $SA_KEY_FILE
+fi
+kubectl create secret -n ${NAMESPACE} generic user-gcp-sa --from-file=user-gcp-sa.json=$SA_KEY_FILE --dry-run -o yaml | kubectl apply -f -
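
Note: deploy-cluster.sh (and install-argo.sh below) lean on one idempotency idiom worth calling out; a minimal sketch:

    # `kubectl create` fails if the object already exists, so render the object
    # with a client-side dry run and pipe it to `kubectl apply`, which succeeds
    # whether the object is new or already present.
    kubectl create namespace "$NAMESPACE" --dry-run -o yaml | kubectl apply -f -

On kubectl 1.18+ the equivalent spelling is --dry-run=client; the bare --dry-run shown here matches the kubectl versions of this patch's era.
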
diff --git a/test/deploy-pipeline-lite.sh b/test/deploy-pipeline-lite.sh
new file mode 100755
index 00000000000..ae141d680be
--- /dev/null
+++ b/test/deploy-pipeline-lite.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+if ! which kustomize; then
+  # Download the kustomize cli tool
+  TOOL_DIR=${DIR}/bin
+  mkdir -p ${TOOL_DIR}
+  wget https://github.com/kubernetes-sigs/kustomize/releases/download/v3.1.0/kustomize_3.1.0_linux_amd64 -O ${TOOL_DIR}/kustomize
+  chmod +x ${TOOL_DIR}/kustomize
+  PATH=${PATH}:${TOOL_DIR}
+fi
+
+# Delete argo first because KFP comes with argo too.
+kubectl delete namespace argo --wait || echo "No argo installed"
+
+KFP_MANIFEST_DIR=${DIR}/manifests
+pushd ${KFP_MANIFEST_DIR}
+
+# Overriding images via `kustomize edit set image` is the recommended approach.
+# reference: https://github.com/kubernetes-sigs/kustomize/blob/master/docs/eschewedFeatures.md#build-time-side-effects-from-cli-args-or-env-variables
+kustomize edit set image gcr.io/ml-pipeline/api-server=${GCR_IMAGE_BASE_DIR}/api-server:latest
+kustomize edit set image gcr.io/ml-pipeline/persistenceagent=${GCR_IMAGE_BASE_DIR}/persistenceagent:latest
+kustomize edit set image gcr.io/ml-pipeline/scheduledworkflow=${GCR_IMAGE_BASE_DIR}/scheduledworkflow:latest
+kustomize edit set image gcr.io/ml-pipeline/frontend=${GCR_IMAGE_BASE_DIR}/frontend:latest
+cat kustomization.yaml
+
+kustomize build . | kubectl apply -f -
+# show current info
+echo "Status of pods after kubectl apply"
+kubectl get pods -n ${NAMESPACE}
+
+popd
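
Note: each `kustomize edit set image` call above fills in the `images: []` placeholder in test/manifests/kustomization.yaml. The resulting stanza is shaped roughly like this (values illustrative):

    images:
    - name: gcr.io/ml-pipeline/api-server
      newName: gcr.io/<project>/<commit-sha>/api-server
      newTag: latest

`kustomize build .` then applies these overrides to every matching image reference in the namespaced-install base.
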
diff --git a/test/install-argo.sh b/test/install-argo.sh
index 6e2ef523d19..fad8af483b0 100755
--- a/test/install-argo.sh
+++ b/test/install-argo.sh
@@ -20,20 +20,25 @@ set -ex
 kubectl config set-context $(kubectl config current-context) --namespace=default
 echo "Add necessary cluster role bindings"
 ACCOUNT=$(gcloud info --format='value(config.account)')
-kubectl create clusterrolebinding PROW_BINDING --clusterrole=cluster-admin --user=$ACCOUNT
-kubectl create clusterrolebinding DEFAULT_BINDING --clusterrole=cluster-admin --serviceaccount=default:default
+kubectl create clusterrolebinding PROW_BINDING --clusterrole=cluster-admin --user=$ACCOUNT --dry-run -o yaml | kubectl apply -f -
+kubectl create clusterrolebinding DEFAULT_BINDING --clusterrole=cluster-admin --serviceaccount=default:default --dry-run -o yaml | kubectl apply -f -
 
-echo "install argo"
 ARGO_VERSION=v2.3.0
-mkdir -p ~/bin/
-export PATH=~/bin/:$PATH
-curl -sSL -o ~/bin/argo https://github.com/argoproj/argo/releases/download/$ARGO_VERSION/argo-linux-amd64
-chmod +x ~/bin/argo
-#kubectl create ns argo
-#kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml
+
+# Install the argo CLI if it is not already available.
+if ! which argo; then
+  echo "install argo"
+  mkdir -p ~/bin/
+  export PATH=~/bin/:$PATH
+  curl -sSL -o ~/bin/argo https://github.com/argoproj/argo/releases/download/$ARGO_VERSION/argo-linux-amd64
+  chmod +x ~/bin/argo
+fi
+
+kubectl create ns argo --dry-run -o yaml | kubectl apply -f -
+kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml
 
 # Some workflows are deployed to the non-default namespace where the GCP credential secret is stored
 # In this case, the default service account in that namespace doesn't have enough permission
 echo "add service account for running the test workflow"
-kubectl create serviceaccount test-runner -n ${NAMESPACE}
-kubectl create clusterrolebinding test-admin-binding --clusterrole=cluster-admin --serviceaccount=${NAMESPACE}:test-runner
+kubectl create serviceaccount test-runner -n ${NAMESPACE} --dry-run -o yaml | kubectl apply -f -
+kubectl create clusterrolebinding test-admin-binding --clusterrole=cluster-admin --serviceaccount=${NAMESPACE}:test-runner --dry-run -o yaml | kubectl apply -f -
diff --git a/test/manifests/kustomization.yaml b/test/manifests/kustomization.yaml
new file mode 100644
index 00000000000..d785bf95edf
--- /dev/null
+++ b/test/manifests/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Actual image overrides will be added in test scripts.
+images: []
+resources:
+- ../../manifests/kustomize/namespaced-install
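
Note: to inspect what this test overlay would deploy without touching a cluster, a quick sketch (assumes kustomize v3.1, as pinned in deploy-pipeline-lite.sh above):

    cd test/manifests
    kustomize build . > /tmp/kfp-rendered.yaml   # rendered manifests only, nothing applied
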
diff --git a/test/presubmit-tests-with-pipeline-deployment.sh b/test/presubmit-tests-with-pipeline-deployment.sh
index 7e75642af51..d1a506e06e6 100755
--- a/test/presubmit-tests-with-pipeline-deployment.sh
+++ b/test/presubmit-tests-with-pipeline-deployment.sh
@@ -68,45 +68,25 @@ GCR_IMAGE_BASE_DIR=gcr.io/${PROJECT}/${PULL_PULL_SHA}
 TEST_RESULTS_GCS_DIR=gs://${TEST_RESULT_BUCKET}/${PULL_PULL_SHA}/${TEST_RESULT_FOLDER}
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
 
+# Configure `time` command output format.
+TIMEFORMAT="[test-timing] It took %lR."
+
 echo "presubmit test starts"
-source "${DIR}/test-prep.sh"
+time source "${DIR}/test-prep.sh"
+echo "test env prepared"
 
-# Deploy Kubeflow
-source "${DIR}/deploy-kubeflow.sh"
+time source "${DIR}/deploy-cluster.sh"
+echo "cluster deployed"
 
 # Install Argo CLI and test-runner service account
-source "${DIR}/install-argo.sh"
+time source "${DIR}/install-argo.sh"
+echo "argo installed"
 
-IMAGE_BUILDER_ARG=""
-# When project is not ml-pipeline-test, VMs need permission to fetch some images in gcr.io/ml-pipeline-test.
-if [ "$PROJECT" != "ml-pipeline-test" ]; then
-  COPIED_IMAGE_BUILDER_IMAGE=${GCR_IMAGE_BASE_DIR}/image-builder
-  echo "Copy image builder image to ${COPIED_IMAGE_BUILDER_IMAGE}"
-  yes | gcloud container images add-tag \
-    gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4 \
-    ${COPIED_IMAGE_BUILDER_IMAGE}:latest
-  IMAGE_BUILDER_ARG="-p image-builder-image=${COPIED_IMAGE_BUILDER_IMAGE}"
-fi
+time source "${DIR}/build-images.sh"
+echo "KFP images built"
 
-# Build Images
-echo "submitting argo workflow to build docker images for commit ${PULL_PULL_SHA}..."
-ARGO_WORKFLOW=`argo submit ${DIR}/build_image.yaml \
--p image-build-context-gcs-uri="$remote_code_archive_uri" \
-${IMAGE_BUILDER_ARG} \
--p api-image="${GCR_IMAGE_BASE_DIR}/api-server" \
--p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \
--p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \
--p persistenceagent-image="${GCR_IMAGE_BASE_DIR}/persistenceagent" \
--n ${NAMESPACE} \
---serviceaccount test-runner \
--o name
-`
-echo "build docker images workflow submitted successfully"
-source "${DIR}/check-argo-status.sh"
-echo "build docker images workflow completed"
-
-# Deploy the pipeline
-source ${DIR}/deploy-pipeline.sh --gcr_image_base_dir ${GCR_IMAGE_BASE_DIR}
+time source "${DIR}/deploy-pipeline-lite.sh"
+echo "KFP lite deployed"
 
 echo "submitting argo workflow to run tests for commit ${PULL_PULL_SHA}..."
 ARGO_WORKFLOW=`argo submit ${DIR}/${WORKFLOW_FILE} \
@@ -119,7 +99,6 @@ ${IMAGE_BUILDER_ARG} \
 --serviceaccount test-runner \
 -o name
 `
-
 echo "test workflow submitted successfully"
-source "${DIR}/check-argo-status.sh"
+time source "${DIR}/check-argo-status.sh"
 echo "test workflow completed"
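
Note: the timing wrapper introduced above is plain bash. The `time` reserved word honors the TIMEFORMAT variable, and %lR prints elapsed wall-clock time in long form. A standalone sketch with a hypothetical step script:

    TIMEFORMAT="[test-timing] It took %lR."
    time source ./some-step.sh   # prints e.g. "[test-timing] It took 3m12.045s." to stderr

Using `time source` rather than running each step in a subshell keeps the timing while still letting steps export variables (such as IMAGE_BUILDER_ARG, set in build-images.sh and consumed by the test workflow submission) to later steps.
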