From 3ecefe36b46221b5b0c499fc7629ba8d3a00b6f6 Mon Sep 17 00:00:00 2001 From: Yuan Gong Date: Thu, 22 Aug 2019 18:38:25 +0800 Subject: [PATCH 1/3] Use cloud build to build images instead --- test/build-images.sh | 46 ++++++++--------- test/check-build-image-status.sh | 51 +++++++++++++++++++ test/cloudbuild/api_server.yaml | 8 +++ test/cloudbuild/frontend.yaml | 8 +++ test/cloudbuild/persistence_agent.yaml | 8 +++ test/cloudbuild/scheduled_workflow.yaml | 8 +++ ...resubmit-tests-with-pipeline-deployment.sh | 9 ++-- 7 files changed, 110 insertions(+), 28 deletions(-) create mode 100755 test/check-build-image-status.sh create mode 100644 test/cloudbuild/api_server.yaml create mode 100644 test/cloudbuild/frontend.yaml create mode 100644 test/cloudbuild/persistence_agent.yaml create mode 100644 test/cloudbuild/scheduled_workflow.yaml diff --git a/test/build-images.sh b/test/build-images.sh index 472c219948e..51279c8eee5 100755 --- a/test/build-images.sh +++ b/test/build-images.sh @@ -16,15 +16,7 @@ set -ex -IMAGE_BUILDER_ARG="" -if [ "$PROJECT" != "ml-pipeline-test" ]; then - COPIED_IMAGE_BUILDER_IMAGE=${GCR_IMAGE_BASE_DIR}/image-builder - echo "Copy image builder image to ${COPIED_IMAGE_BUILDER_IMAGE}" - yes | gcloud container images add-tag \ - gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4 \ - ${COPIED_IMAGE_BUILDER_IMAGE}:latest - IMAGE_BUILDER_ARG="-p image-builder-image=${COPIED_IMAGE_BUILDER_IMAGE}" -fi +IMAGES_BUILDING=false # Image caching can be turned off by setting $DISABLE_IMAGE_CACHING env flag. # Note that GCR_IMAGE_BASE_DIR contains commit hash, so whenever there's a code @@ -40,20 +32,24 @@ then echo "docker images for api-server, frontend, scheduledworkflow and \ persistenceagent are already built in ${GCR_IMAGE_BASE_DIR}." else - echo "submitting argo workflow to build docker images for commit ${PULL_PULL_SHA}..." 
- # Build Images - ARGO_WORKFLOW=`argo submit ${DIR}/build_image.yaml \ - -p image-build-context-gcs-uri="$remote_code_archive_uri" \ - ${IMAGE_BUILDER_ARG} \ - -p api-image="${GCR_IMAGE_BASE_DIR}/api-server" \ - -p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \ - -p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \ - -p persistenceagent-image="${GCR_IMAGE_BASE_DIR}/persistenceagent" \ - -n ${NAMESPACE} \ - --serviceaccount test-runner \ - -o name - ` - echo "build docker images workflow submitted successfully" - source "${DIR}/check-argo-status.sh" - echo "build docker images workflow completed" + echo "submitting cloud build to build docker images for commit ${PULL_PULL_SHA}..." + IMAGES_BUILDING=true + CLOUD_BUILD_COMMON_ARGS=(. --async --format='value(id)' --substitutions=_GCR_BASE=${GCR_IMAGE_BASE_DIR}) + # Use faster machine because this is CPU intensive + BUILD_ID_API_SERVER=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ + --config ${DIR}/cloudbuild/api_server.yaml) + BUILD_ID_FRONTEND=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ + --config ${DIR}/cloudbuild/frontend.yaml) + BUILD_ID_SCHEDULED_WORKFLOW=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ + --config ${DIR}/cloudbuild/scheduled_workflow.yaml) + BUILD_ID_PERSISTENCE_AGENT=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ + --config ${DIR}/cloudbuild/persistence_agent.yaml) + + BUILD_IDS=( + "${BUILD_ID_API_SERVER}" + "${BUILD_ID_FRONTEND}" + "${BUILD_ID_SCHEDULED_WORKFLOW}" + "${BUILD_ID_PERSISTENCE_AGENT}" + ) + echo "Submitted the following cloud build jobs: ${BUILD_IDS[@]}" fi diff --git a/test/check-build-image-status.sh b/test/check-build-image-status.sh new file mode 100755 index 00000000000..de899904318 --- /dev/null +++ b/test/check-build-image-status.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex + +if [ "$IMAGES_BUILDING" == true ]; then + MAX_ATTEMPT=$(expr $TIMEOUT_SECONDS / 20) + for i in $(seq 1 ${MAX_ATTEMPT}) + do + (( success_count=0 )) || true + for id in "${BUILD_IDS[@]}" + do + status=$(gcloud builds describe $id --format='value(status)') || status="FETCH_ERROR" + case "$status" in + "SUCCESS") + (( ++success_count )) + ;; + "WORKING") + # do nothing + ;; + "FETCH_ERROR") + echo "Fetching cloud build status failed, retrying..." + ;; + *) + echo "Cloud build with build id ${id} failed with status ${status}" + exit 1 + ;; + esac + done + if [ $success_count == 4 ]; then + echo "All cloud builds succeeded." + break + fi + + echo "Cloud build in progress, waiting for 20 seconds..." + sleep 20 + done +fi diff --git a/test/cloudbuild/api_server.yaml b/test/cloudbuild/api_server.yaml new file mode 100644 index 00000000000..297f6275f11 --- /dev/null +++ b/test/cloudbuild/api_server.yaml @@ -0,0 +1,8 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '-t', '$_GCR_BASE/api-server', '-f', 'backend/Dockerfile', '.' ] +timeout: 1800s # 30min +options: + machineType: N1_HIGHCPU_8 +images: +- '$_GCR_BASE/api-server' diff --git a/test/cloudbuild/frontend.yaml b/test/cloudbuild/frontend.yaml new file mode 100644 index 00000000000..9b477250010 --- /dev/null +++ b/test/cloudbuild/frontend.yaml @@ -0,0 +1,8 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '-t', '$_GCR_BASE/frontend', '-f', 'frontend/Dockerfile', '.' 
] +timeout: 1800s # 30min +options: + machineType: UNSPECIFIED +images: +- '$_GCR_BASE/frontend' diff --git a/test/cloudbuild/persistence_agent.yaml b/test/cloudbuild/persistence_agent.yaml new file mode 100644 index 00000000000..5d2b1153614 --- /dev/null +++ b/test/cloudbuild/persistence_agent.yaml @@ -0,0 +1,8 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '-t', '$_GCR_BASE/persistenceagent', '-f', 'backend/Dockerfile.persistenceagent', '.' ] +timeout: 1800s # 30min +options: + machineType: UNSPECIFIED +images: +- '$_GCR_BASE/persistenceagent' diff --git a/test/cloudbuild/scheduled_workflow.yaml b/test/cloudbuild/scheduled_workflow.yaml new file mode 100644 index 00000000000..ef91540f6b7 --- /dev/null +++ b/test/cloudbuild/scheduled_workflow.yaml @@ -0,0 +1,8 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: [ 'build', '-t', '$_GCR_BASE/scheduledworkflow', '-f', 'backend/Dockerfile.scheduledworkflow', '.' ] +timeout: 1800s # 30min +options: + machineType: UNSPECIFIED +images: +- '$_GCR_BASE/scheduledworkflow' diff --git a/test/presubmit-tests-with-pipeline-deployment.sh b/test/presubmit-tests-with-pipeline-deployment.sh index d1a506e06e6..4b4add7eda8 100755 --- a/test/presubmit-tests-with-pipeline-deployment.sh +++ b/test/presubmit-tests-with-pipeline-deployment.sh @@ -75,16 +75,19 @@ echo "presubmit test starts" time source "${DIR}/test-prep.sh" echo "test env prepared" +time source "${DIR}/build-images.sh" +echo "KFP images cloudbuild jobs submitted" + time source "${DIR}/deploy-cluster.sh" echo "cluster deployed" +time source "${DIR}/check-build-image-status.sh" +echo "KFP images built" + # Install Argo CLI and test-runner service account time source "${DIR}/install-argo.sh" echo "argo installed" -time source "${DIR}/build-images.sh" -echo "KFP images built" - time source "${DIR}/deploy-pipeline-lite.sh" echo "KFP lite deployed" From ee7fb294b7217d8e25469d25098a37cb901f5fc8 Mon Sep 17 00:00:00 2001 From: Yuan Gong Date: Fri, 23 
Aug 2019 08:56:21 +0800 Subject: [PATCH 2/3] Batch 3 image build tasks --- test/build-images.sh | 15 +++++---------- test/cloudbuild/api_server.yaml | 2 +- test/cloudbuild/batch_build.yaml | 20 ++++++++++++++++++++ test/cloudbuild/frontend.yaml | 8 -------- test/cloudbuild/persistence_agent.yaml | 8 -------- test/cloudbuild/scheduled_workflow.yaml | 8 -------- test/install-argo.sh | 5 +++-- 7 files changed, 29 insertions(+), 37 deletions(-) create mode 100644 test/cloudbuild/batch_build.yaml delete mode 100644 test/cloudbuild/frontend.yaml delete mode 100644 test/cloudbuild/persistence_agent.yaml delete mode 100644 test/cloudbuild/scheduled_workflow.yaml diff --git a/test/build-images.sh b/test/build-images.sh index 51279c8eee5..a6dfd638df2 100755 --- a/test/build-images.sh +++ b/test/build-images.sh @@ -35,21 +35,16 @@ else echo "submitting cloud build to build docker images for commit ${PULL_PULL_SHA}..." IMAGES_BUILDING=true CLOUD_BUILD_COMMON_ARGS=(. --async --format='value(id)' --substitutions=_GCR_BASE=${GCR_IMAGE_BASE_DIR}) - # Use faster machine because this is CPU intensive + # Split into two tasks because api_server builds slowly, use a separate task + # to make it faster. 
BUILD_ID_API_SERVER=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ --config ${DIR}/cloudbuild/api_server.yaml) - BUILD_ID_FRONTEND=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ - --config ${DIR}/cloudbuild/frontend.yaml) - BUILD_ID_SCHEDULED_WORKFLOW=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ - --config ${DIR}/cloudbuild/scheduled_workflow.yaml) - BUILD_ID_PERSISTENCE_AGENT=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ - --config ${DIR}/cloudbuild/persistence_agent.yaml) + BUILD_ID_BATCH=$(gcloud builds submit ${CLOUD_BUILD_COMMON_ARGS[@]} \ + --config ${DIR}/cloudbuild/batch_build.yaml) BUILD_IDS=( "${BUILD_ID_API_SERVER}" - "${BUILD_ID_FRONTEND}" - "${BUILD_ID_SCHEDULED_WORKFLOW}" - "${BUILD_ID_PERSISTENCE_AGENT}" + "${BUILD_ID_BATCH}" ) echo "Submitted the following cloud build jobs: ${BUILD_IDS[@]}" fi diff --git a/test/cloudbuild/api_server.yaml b/test/cloudbuild/api_server.yaml index 297f6275f11..b8e20adaec9 100644 --- a/test/cloudbuild/api_server.yaml +++ b/test/cloudbuild/api_server.yaml @@ -3,6 +3,6 @@ steps: args: [ 'build', '-t', '$_GCR_BASE/api-server', '-f', 'backend/Dockerfile', '.' ] timeout: 1800s # 30min options: - machineType: N1_HIGHCPU_8 + machineType: N1_HIGHCPU_8 # This is cpu intensive, use a better machine. 
images: - '$_GCR_BASE/api-server' diff --git a/test/cloudbuild/batch_build.yaml b/test/cloudbuild/batch_build.yaml new file mode 100644 index 00000000000..96359d5f34b --- /dev/null +++ b/test/cloudbuild/batch_build.yaml @@ -0,0 +1,20 @@ +steps: + - name: "gcr.io/cloud-builders/docker" + args: + ["build", "-t", "$_GCR_BASE/persistenceagent", "-f", "backend/Dockerfile.persistenceagent", "."] + waitFor: ["-"] + - name: "gcr.io/cloud-builders/docker" + args: + ["build", "-t", "$_GCR_BASE/scheduledworkflow", "-f", "backend/Dockerfile.scheduledworkflow", "."] + waitFor: ["-"] + - name: "gcr.io/cloud-builders/docker" + args: + ["build", "-t", "$_GCR_BASE/frontend", "-f", "frontend/Dockerfile", "."] + waitFor: ["-"] +options: + machineType: N1_HIGHCPU_8 # use a fast machine to build because there is a lot of work +images: + - "$_GCR_BASE/frontend" + - "$_GCR_BASE/scheduledworkflow" + - "$_GCR_BASE/persistenceagent" +timeout: 1800s # 30min diff --git a/test/cloudbuild/frontend.yaml b/test/cloudbuild/frontend.yaml deleted file mode 100644 index 9b477250010..00000000000 --- a/test/cloudbuild/frontend.yaml +++ /dev/null @@ -1,8 +0,0 @@ -steps: -- name: 'gcr.io/cloud-builders/docker' - args: [ 'build', '-t', '$_GCR_BASE/frontend', '-f', 'frontend/Dockerfile', '.' ] -timeout: 1800s # 30min -options: - machineType: UNSPECIFIED -images: -- '$_GCR_BASE/frontend' diff --git a/test/cloudbuild/persistence_agent.yaml b/test/cloudbuild/persistence_agent.yaml deleted file mode 100644 index 5d2b1153614..00000000000 --- a/test/cloudbuild/persistence_agent.yaml +++ /dev/null @@ -1,8 +0,0 @@ -steps: -- name: 'gcr.io/cloud-builders/docker' - args: [ 'build', '-t', '$_GCR_BASE/persistenceagent', '-f', 'backend/Dockerfile.persistenceagent', '.' 
] -timeout: 1800s # 30min -options: - machineType: UNSPECIFIED -images: -- '$_GCR_BASE/persistenceagent' diff --git a/test/cloudbuild/scheduled_workflow.yaml b/test/cloudbuild/scheduled_workflow.yaml deleted file mode 100644 index ef91540f6b7..00000000000 --- a/test/cloudbuild/scheduled_workflow.yaml +++ /dev/null @@ -1,8 +0,0 @@ -steps: -- name: 'gcr.io/cloud-builders/docker' - args: [ 'build', '-t', '$_GCR_BASE/scheduledworkflow', '-f', 'backend/Dockerfile.scheduledworkflow', '.' ] -timeout: 1800s # 30min -options: - machineType: UNSPECIFIED -images: -- '$_GCR_BASE/scheduledworkflow' diff --git a/test/install-argo.sh b/test/install-argo.sh index fad8af483b0..1eca2a3b9ce 100755 --- a/test/install-argo.sh +++ b/test/install-argo.sh @@ -34,8 +34,9 @@ if ! which argo; then chmod +x ~/bin/argo fi -kubectl create ns argo --dry-run -o yaml | kubectl apply -f - -kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml +# No need to install here, it comes with kfp lite deployment +# kubectl create ns argo --dry-run -o yaml | kubectl apply -f - +# kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml # Some workflows are deployed to the non-default namespace where the GCP credential secret is stored # In this case, the default service account in that namespace doesn't have enough permission From d65ca066a25be20d0b86bc8f04294569ffcee7e6 Mon Sep 17 00:00:00 2001 From: Yuan Gong Date: Fri, 23 Aug 2019 10:33:51 +0800 Subject: [PATCH 3/3] Fix check cloud build status script --- test/check-build-image-status.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/check-build-image-status.sh b/test/check-build-image-status.sh index de899904318..2ff37b99445 100755 --- a/test/check-build-image-status.sh +++ b/test/check-build-image-status.sh @@ -18,21 +18,23 @@ set -ex if [ "$IMAGES_BUILDING" == true ]; then MAX_ATTEMPT=$(expr 
$TIMEOUT_SECONDS / 20) + PENDING_BUILD_IDS=("${BUILD_IDS[@]}") # copy pending build ids for i in $(seq 1 ${MAX_ATTEMPT}) do - (( success_count=0 )) || true - for id in "${BUILD_IDS[@]}" + NEW_PENDING_BUILD_IDS=() + for id in "${PENDING_BUILD_IDS[@]}" do status=$(gcloud builds describe $id --format='value(status)') || status="FETCH_ERROR" case "$status" in "SUCCESS") - (( ++success_count )) + echo "Build with id ${id} has succeeded." ;; "WORKING") - # do nothing + NEW_PENDING_BUILD_IDS+=( "$id" ) ;; "FETCH_ERROR") echo "Fetching cloud build status failed, retrying..." + NEW_PENDING_BUILD_IDS+=( "$id" ) ;; *) echo "Cloud build with build id ${id} failed with status ${status}" @@ -40,11 +42,11 @@ if [ "$IMAGES_BUILDING" == true ]; then ;; esac done - if [ $success_count == 4 ]; then + PENDING_BUILD_IDS=("${NEW_PENDING_BUILD_IDS[@]}") + if [ 0 == "${#PENDING_BUILD_IDS[@]}" ]; then echo "All cloud builds succeeded." break fi - echo "Cloud build in progress, waiting for 20 seconds..." sleep 20 done