From a3e870130c876847d35a9d99e006f6080b1deb27 Mon Sep 17 00:00:00 2001 From: Oleg Avdeev Date: Mon, 25 Jan 2021 19:51:28 -0800 Subject: [PATCH] add emr CICD entrypoint script Signed-off-by: Oleg Avdeev --- infra/scripts/codebuild-entrypoint.sh | 157 ++++++++++++++++++++++++++ infra/scripts/k8s-common-functions.sh | 2 +- 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100755 infra/scripts/codebuild-entrypoint.sh diff --git a/infra/scripts/codebuild-entrypoint.sh b/infra/scripts/codebuild-entrypoint.sh new file mode 100755 index 0000000000..bf7792da34 --- /dev/null +++ b/infra/scripts/codebuild-entrypoint.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +set -euo pipefail + +STEP_BREADCRUMB='~~~~~~~~' +SECONDS=0 +TIMEFORMAT="${STEP_BREADCRUMB} took %R seconds" + +function maybe_build_push_docker { + # Build and push docker image, tagged with SHA tag, if it doesn't exist already. + NAME=$1 + TARGET=$NAME-docker + SUFFIX=feast-$NAME + + if ! aws ecr describe-images --repository-name "feast-ci/feast/$SUFFIX" "--image-ids=imageTag=${GIT_TAG}" >/dev/null ; then + make "build-$TARGET" "push-$TARGET" REGISTRY="${DOCKER_REPOSITORY}" VERSION="${GIT_TAG}" + else + echo "Image ${DOCKER_REPOSITORY}/$SUFFIX:$GIT_TAG already exists, skipping docker build" + fi +} + +source infra/scripts/k8s-common-functions.sh + +GIT_TAG=${CODEBUILD_RESOLVED_SOURCE_VERSION} + +echo "########## Starting stage $STAGE for ${CODEBUILD_SOURCE_REPO_URL} ${GIT_TAG} ###########" + +# This seems to make builds a bit faster. +export DOCKER_BUILDKIT=1 + +# Workaround for COPY command in core docker image that pulls local maven repo into the image +# itself. +mkdir .m2 2>/dev/null || true +mkdir deps/feast/.m2 2>/dev/null || true + +# Log into k8s. +echo "${STEP_BREADCRUMB} Updating kubeconfig" +aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" + +# chmod kubeconfig so it doesn't complain all the time +chmod 755 ~/.kube/config + +# Sanity check that kubectl is working. +echo "${STEP_BREADCRUMB} k8s sanity check" +kubectl get pods + +case $STAGE in + core-docker) + maybe_build_push_docker core + ;; + serving-docker) + maybe_build_push_docker serving + ;; + jupyter-docker) + maybe_build_push_docker jupyter + ;; + jobservice-docker) + maybe_build_push_docker jobservice + ;; + ci-docker) + maybe_build_push_docker ci + ;; + e2e-test-emr) + # EMR test - runs in default namespace. + + # Copy cluster config template generated for us by terraform. + aws s3 cp "${EMR_TEMPLATE_YML}" emr_cluster.yaml + + # Delete old helm release and PVCs + k8s_cleanup cicd default + + # Create cluster OR get existing EMR cluster id. In the latter case, clean up any steps + # already running there from previous test runs. + echo "${STEP_BREADCRUMB} Creating EMR cluster, this can take up 10 minutes." + CLUSTER_ID=$(time emr_cluster.py --template emr_cluster.yaml ensure --cleanup) + + # Get (any) node IP. EMR will use this to connect to Kafka and Redis. We make them + # available to the EMR job by exposing them as NodePort services. + NODE_IP=$(kubectl get nodes -o custom-columns=Name:.metadata.name | tail -n1) + + # Helm install everything. + # + # This may occasionally run into "provided port is already allocated" error due to + # https://github.com/kubernetes/kubernetes/issues/85894 + helm_install cicd "$DOCKER_REPOSITORY" "$GIT_TAG" default \ + --set "redis.master.service.type=NodePort" \ + --set "redis.master.service.nodePort=32379" \ + --set "kafka.externalAccess.service.type=NodePort" \ + --set "kafka.externalAccess.enabled=true" \ + --set "kafka.externalAccess.service.nodePorts[0]=30092" \ + --set "kafka.externalAccess.service.domain=${NODE_IP}" \ + --set "kafka.service.externalPort=30094" + + # Run the test suite as a one-off pod. We could also run it here, in the codebuild container + # itself, but that'd require more networking setup to make feast services available + # outside k8s cluster. + kubectl delete pod ci-test-runner 2>/dev/null || true + + echo "${STEP_BREADCRUMB} Running the test suite" + time kubectl run --rm -i ci-test-runner \ + --restart=Never \ + --image="${DOCKER_REPOSITORY}/feast-ci:${GIT_TAG}" \ + --env="CLUSTER_ID=$CLUSTER_ID" \ + --env="STAGING_PATH=$STAGING_PATH" \ + --env="NODE_IP=$NODE_IP" \ + -- \ + bash -c "mkdir src && cd src && git clone $CODEBUILD_SOURCE_REPO_URL && cd feast* && git config remote.origin.fetch '+refs/pull/*:refs/remotes/origin/pull/*' && git fetch -q && git checkout $CODEBUILD_RESOLVED_SOURCE_VERSION && ./infra/scripts/setup-e2e-env-aws.sh && ./infra/scripts/test-end-to-end-aws.sh" + + ;; + e2e-test-sparkop) + # spark k8s test - runs in sparkop namespace (so it doesn't interfere with a concurrently + # running EMR test). + NAMESPACE=sparkop + RELEASE=sparkop + + # Clean up old release + k8s_cleanup "$RELEASE" "$NAMESPACE" + + # Helm install everything in a namespace + helm_install "$RELEASE" "${DOCKER_REPOSITORY}" "${GIT_TAG}" "$NAMESPACE" + + # Delete old test running pod if it exists + kubectl delete pod -n "$NAMESPACE" ci-test-runner 2>/dev/null || true + + # Delete all sparkapplication resources that may be left over from the previous test runs. + kubectl delete sparkapplication --all -n "$NAMESPACE" || true + + # Make sure the test pod has permissions to create sparkapplication resources + setup_sparkop_role + + # Run the test suite as a one-off pod. + echo "${STEP_BREADCRUMB} Running the test suite" + if ! time kubectl run --rm -n "$NAMESPACE" -i ci-test-runner \ + --restart=Never \ + --image="${DOCKER_REPOSITORY}/feast-ci:${GIT_TAG}" \ + --env="STAGING_PATH=$STAGING_PATH" \ + -- \ + bash -c "mkdir src && cd src && git clone $CODEBUILD_SOURCE_REPO_URL && cd feast* && git config remote.origin.fetch '+refs/pull/*:refs/remotes/origin/pull/*' && git fetch -q && git checkout $CODEBUILD_RESOLVED_SOURCE_VERSION && ./infra/scripts/setup-e2e-env-sparkop.sh && ./infra/scripts/test-end-to-end-sparkop.sh" ; then + + readarray -t CRASHED_PODS < <(kubectl get pods --no-headers=true --namespace sparkop | grep Error | awk '{ print $1 }') + + for POD in "${CRASHED_PODS[@]}"; do + echo "Logs from crashed pod $POD:" + kubectl logs --namespace sparkop "$POD" + done + fi + + ;; + cleanup) + emr_cluster.py --template emr_cluster.yaml destroy + ;; + *) + echo "Unknown stage $STAGE" + ;; +esac + +echo "########## Stage $STAGE took $SECONDS seconds ###########" diff --git a/infra/scripts/k8s-common-functions.sh b/infra/scripts/k8s-common-functions.sh index 520dbd4817..b6161fcfd4 100644 --- a/infra/scripts/k8s-common-functions.sh +++ b/infra/scripts/k8s-common-functions.sh @@ -56,7 +56,7 @@ function helm_install { # has some issues with unbound PVCs (that cause kubectl delete pvc to hang). echo "${STEP_BREADCRUMB:-} Helm installing feast" - if ! time helm install --wait "$RELEASE" ./infra/charts/feast \ + if ! time helm install --wait "$RELEASE" "${HELM_CHART_LOCATION:-./infra/charts/feast}" \ --timeout 15m \ --set "feast-jupyter.image.repository=${DOCKER_REPOSITORY}/feast-jupyter" \ --set "feast-jupyter.image.tag=${GIT_TAG}" \