From 7912912351fd0d077fe49d6a14c80594f3fc6fbc Mon Sep 17 00:00:00 2001 From: chaodaiG <45011425+chaodaiG@users.noreply.github.com> Date: Tue, 26 Mar 2019 10:34:51 -0700 Subject: [PATCH] Cleanup job also deletes stale Boskos clusters (#570) * add function for deleting old clusters * update README * Update based on PR comments * Update scripts/library.sh Co-Authored-By: chaodaiG <45011425+chaodaiG@users.noreply.github.com> * fix bad merge * Update based on PR comments --- ci/prow/config.yaml | 4 +-- ci/prow/make_config.go | 4 +-- scripts/library.sh | 6 ++++ test/unit/cleanup-tests.sh | 32 +++++++++-------- tools/cleanup/README.md | 26 +++++--------- tools/cleanup/cleanup-functions.sh | 56 +++++++++++++++++++++++------- tools/cleanup/cleanup.sh | 34 ++++++++---------- 7 files changed, 95 insertions(+), 67 deletions(-) diff --git a/ci/prow/config.yaml b/ci/prow/config.yaml index 711a606bdae..8c0ac06ecaa 100644 --- a/ci/prow/config.yaml +++ b/ci/prow/config.yaml @@ -2662,9 +2662,9 @@ periodics: command: - "./tools/cleanup/cleanup.sh" args: - - "delete-old-gcr-images" - "--project-resource-yaml ci/prow/boskos/resources.yaml" - - "--days-to-keep 30" + - "--days-to-keep-images 30" + - "--hours-to-keep-clusters 24" - "--service-account /etc/test-account/service-account.json" - "--artifacts $(ARTIFACTS)" volumeMounts: diff --git a/ci/prow/make_config.go b/ci/prow/make_config.go index bdcd6db5c26..dfd2751a1c4 100644 --- a/ci/prow/make_config.go +++ b/ci/prow/make_config.go @@ -925,9 +925,9 @@ func generateCleanupPeriodicJob() { data.Base.DecorationConfig = []string{"timeout: 28800000000000"} // 8 hours data.Base.Command = cleanupScript data.Base.Args = []string{ - "delete-old-gcr-images", "--project-resource-yaml ci/prow/boskos/resources.yaml", - "--days-to-keep 30", + "--days-to-keep-images 30", + "--hours-to-keep-clusters 24", "--service-account " + data.Base.ServiceAccount, "--artifacts $(ARTIFACTS)"} addExtraEnvVarsToJob(&data.Base) diff --git a/scripts/library.sh 
b/scripts/library.sh index 3472b477357..0f4252775d8 100755 --- a/scripts/library.sh +++ b/scripts/library.sh @@ -429,6 +429,12 @@ function remove_broken_symlinks() { done } +# Return whether the given parameter is knative-tests. +# Parameters: $1 - project name +function is_protected_project() { + [[ -n "$1" && "$1" == "knative-tests" ]] +} + # Returns the canonical path of a filesystem object. # Parameters: $1 - path to return in canonical form # $2 - base dir for relative links; optional, defaults to current diff --git a/test/unit/cleanup-tests.sh b/test/unit/cleanup-tests.sh index 3066c7160ad..329ae1f33f0 100755 --- a/test/unit/cleanup-tests.sh +++ b/test/unit/cleanup-tests.sh @@ -34,18 +34,17 @@ cd ${REPO_ROOT_DIR} echo ">> Testing directly invoking cleanup script" -test_function ${FAILURE} "error: unknown option" cleanup_script "action-not-exist" -test_function ${FAILURE} "error: missing gcr" cleanup_script "delete-old-images-from-gcr" -test_function ${FAILURE} "error: missing resource" cleanup_script "delete-old-gcr-images" +test_function ${FAILURE} "error: missing parameter" cleanup_script -test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --project-resource-yaml --dry-run -test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --re-project-name --dry-run -test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --gcr-to-cleanup --dry-run -test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --days-to-keep --dry-run -test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --artifacts --dry-run +test_function ${FAILURE} "error: expecting value following" cleanup_script --project-resource-yaml --dry-run +test_function ${FAILURE} "error: expecting value following" cleanup_script --re-project-name --dry-run +test_function ${FAILURE} "error: 
expecting value following" cleanup_script --gcr-to-cleanup --dry-run +test_function ${FAILURE} "error: expecting value following" cleanup_script --days-to-keep-images --dry-run +test_function ${FAILURE} "error: expecting value following" cleanup_script --hours-to-keep-clusters --dry-run +test_function ${FAILURE} "error: expecting value following" cleanup_script --artifacts --dry-run -test_function ${FAILURE} "error: days to keep" cleanup_script "delete-old-images-from-gcr" --days-to-keep "a" --dry-run -test_function ${FAILURE} "error: days to keep" cleanup_script "delete-old-gcr-images" --days-to-keep "a" --dry-run +test_function ${FAILURE} "error: days to keep" cleanup_script --days-to-keep-images "a" --dry-run +test_function ${FAILURE} "error: hours to keep" cleanup_script --hours-to-keep-clusters "a" --dry-run # Test individual functions echo ">> Testing deleting images from single project" @@ -56,9 +55,14 @@ test_function ${SUCCESS} "" mock_gcloud_function delete_old_images_from_gcr ${_F echo ">> Testing deleting images from multiple projects" -test_function ${FAILURE} "error: missing resource" delete_old_gcr_images -test_function ${FAILURE} "error: missing regex" delete_old_gcr_images "file" -test_function ${FAILURE} "error: missing days" delete_old_gcr_images "file" "regex" -test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_gcr_images ${_PROJECT_RESOURCE_YAML} ${_RE_PROJECT_NAME} 99 +test_function ${FAILURE} "error: missing project names" delete_old_gcr_images +test_function ${FAILURE} "error: missing days" delete_old_gcr_images "${_FAKE_BOSKOS_PROJECT_NAME}1" +test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_gcr_images "${_PROJECT_RESOURCE_YAML}1 ${_PROJECT_RESOURCE_YAML}2" 99 + +echo ">> Testing deleting clusters from multiple projects" + +test_function ${FAILURE} "error: missing project names" delete_old_test_clusters +test_function ${FAILURE} "error: missing hours" delete_old_test_clusters "${_FAKE_BOSKOS_PROJECT_NAME}1" 
+test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_test_clusters "${_FAKE_BOSKOS_PROJECT_NAME}1 ${_FAKE_BOSKOS_PROJECT_NAME}2" 99 echo ">> All tests passed" diff --git a/tools/cleanup/README.md b/tools/cleanup/README.md index 5ca221bed0a..e9792e17c21 100644 --- a/tools/cleanup/README.md +++ b/tools/cleanup/README.md @@ -1,34 +1,26 @@ # Resources Clean Up Tool -This tool is designed to clean up stale resources from gcr, for now it only deletes old images created during testing. +This tool is designed to clean up stale test resources. For now it deletes GCR images and GKE clusters created during testing. ## Basic Usage -Directly invoke [cleanup.sh](cleanup.sh) script with certain flags. There is no-op if invoking or sourcing this script without arguments. +Directly invoke [cleanup.sh](cleanup.sh) script with certain flags, but don't source this script. By default the current gcloud credentials are used to delete the images. If necessary, use the flag `--service-account _key-file.json_` to specify a service account that will be performing the access to the gcr. -### Clean up old images from multiple gcrs +Projects to be cleaned up are expected to be defined in a `resources.yaml` file. To remove old images and clusters from them, call [cleanup.sh](cleanup.sh) with following flags: -Projects to be cleaned up are expected to be defined in a `resources.yaml` file. 
To remove old images from them, call [cleanup.sh](cleanup.sh) with action "delete-old-gcr-images" and following flags: - "--project-resource-yaml" as path of `resources.yaml` file - Mandatory - "--re-project-name" for regex matching projects names - Optional, defaults to `knative-boskos-[a-zA-Z0-9]+` -- "--days-to-keep" - Optional, default `365` +- "--days-to-keep-images" - Optional, defaults to `365` as 1 year +- "--hours-to-keep-clusters" - Optional, defaults to `720` as 30 days +- "--dry-run" - Optional, performs dryrun for all gcloud functions, defaults to false Example: -```./cleanup.sh "delete-old-gcr-images" --project-resource-yaml "ci/prow/boskos/resources.yaml" --days-to-keep 90``` - -### Clean up old images from a specific gcr - -Cleaning up from a specific gcr is supported, except for some special ones (_knative-release_ and _knative-nightly_). Call [cleanup.sh](cleanup.sh) with action "delete-old-images-from-gcr" and following flags: -- "--gcr-to-cleanup" as name of gcr, e.g. "gcr.io/foo" - Mandatory -- "--days-to-keep" - Optional, default `365` - -Example: - -```./cleanup.sh "delete-old-images-from-gcr" --gcr-to-cleanup "gcr.io/foo" --days-to-keep 90``` +```./cleanup.sh --project-resource-yaml "ci/prow/boskos/resources.yaml" --days-to-keep-images 90 --hours-to-keep-clusters 24``` +This command deletes test images older than 90 days and test clusters created more than 24 hours ago. ## Prow Job -There is a weekly prow job that triggers this tool runs at 11:00/12:00PM(Day light saving) PST every Monday. This tool scans all gcr projects defined in [ci/prow/boskos/resources.yaml](/ci/prow/boskos/resources.yaml) and deletes images older than 90 days. +There is a weekly prow job that triggers this tool runs at 11:00/12:00PM(Day light saving) PST every Monday. This tool scans all gcr projects defined in [ci/prow/boskos/resources.yaml](/ci/prow/boskos/resources.yaml) and deletes images older than 30 days and clusters older than 24 hours. 
diff --git a/tools/cleanup/cleanup-functions.sh b/tools/cleanup/cleanup-functions.sh index 373b695b640..209ac0ccb4c 100755 --- a/tools/cleanup/cleanup-functions.sh +++ b/tools/cleanup/cleanup-functions.sh @@ -48,21 +48,51 @@ function delete_old_images_from_gcr() { done } -# Delete old images in the GCP projects defined in the yaml file provided. -# Parameters: $1 - yaml file path defining projects that will be cleaned up -# $2 - regex pattern for parsing the project names -# $3 - days to keep images +# Delete old images in the given GCP projects +# Parameters: $1 - array of projects names +# $2 - days to keep images function delete_old_gcr_images() { - [[ -z $1 ]] && abort "missing resource yaml path" - [[ -z $2 ]] && abort "missing regex pattern for project name" - [[ -z $3 ]] && abort "missing days to keep images" - - local target_projects # delared here as local + assignment in one line always return 0 exit code - target_projects="$(grep -Eio "$2" "$1")" - [[ $? -eq 0 ]] || abort "no project found in $1" + [[ -z $1 ]] && abort "missing project names" + [[ -z $2 ]] && abort "missing days to keep images" - for project in ${target_projects}; do + for project in $1; do echo "Start deleting images from ${project}" - delete_old_images_from_gcr "gcr.io/${project}" $3 + delete_old_images_from_gcr "gcr.io/${project}" $2 + done +} + +# Delete old clusters in the given GCP projects +# Parameters: $1 - array of projects names +# $2 - hours to keep clusters +function delete_old_test_clusters() { + [[ -z $1 ]] && abort "missing project names" + [[ -z $2 ]] && abort "missing hours to keep clusters" + + for project in $1; do + echo "Start deleting clusters from ${project}" + + is_protected_project $project && \ + abort "Target project set to $project, which is forbidden" + + local current_time=$(date +%s) + local target_time=$(date -d "`date -d @${current_time}`-$2hours" +%s) + # Fail if the difference of current time and target time is not 3600 times hours to keep + if (( ! 
DRY_RUN )); then # Don't check on dry runs, as dry run is used for unit testing + [[ "$((3600*$2))" -eq "$(($current_time-$target_time))" ]] || abort "date operation failed" + fi + + gcloud --format='get(name,createTime,zone)' container clusters list --project=$project --limit=99999 | \ + while read cluster_name cluster_createtime cluster_zone; do + [[ -n "${cluster_name}" ]] && [[ -z "${cluster_zone}" ]] && abort "list cluster output missing cluster zone" + echo "Checking ${cluster_name} for removal" + local create_time=$(date -d "$cluster_createtime" +%s) + [[ $create_time -gt $current_time ]] && abort "cluster creation time shouldn't be newer than current time" + [[ $create_time -gt $target_time ]] && echo "skip deleting as it's created within $2 hours" && continue + if (( DRY_RUN )); then + echo "[DRY RUN] gcloud container clusters delete -q ${cluster_name} --zone ${cluster_zone} --project=$project" + else + gcloud container clusters delete -q ${cluster_name} --zone ${cluster_zone} --project=$project + fi + done done } diff --git a/tools/cleanup/cleanup.sh b/tools/cleanup/cleanup.sh index 85b02c7052e..47d2a2bfce6 100755 --- a/tools/cleanup/cleanup.sh +++ b/tools/cleanup/cleanup.sh @@ -20,13 +20,12 @@ source $(dirname $0)/cleanup-functions.sh # Global variables DAYS_TO_KEEP_IMAGES=365 # Keep images up to 1 year by default +HOURS_TO_KEEP_CLUSTERS=720 # keep clusters up to 30 days by default RE_PROJECT_NAME="knative-boskos-[a-zA-Z0-9]+" PROJECT_RESOURCE_YAML="" -GCR_TO_CLEANUP="" ARTIFACTS_DIR="" DRY_RUN=0 - function parse_args() { while [[ $# -ne 0 ]]; do local parameter=$1 @@ -38,8 +37,8 @@ function parse_args() { case ${parameter} in --project-resource-yaml) PROJECT_RESOURCE_YAML=$1 ;; --re-project-name) RE_PROJECT_NAME=$1 ;; - --gcr-to-cleanup) GCR_TO_CLEANUP=$1 ;; - --days-to-keep) DAYS_TO_KEEP_IMAGES=$1 ;; + --days-to-keep-images) DAYS_TO_KEEP_IMAGES=$1 ;; + --hours-to-keep-clusters) HOURS_TO_KEEP_CLUSTERS=$1 ;; --artifacts) ARTIFACTS_DIR=$1 ;; --service-account) 
gcloud auth activate-service-account 
--key-file=$1 || exit 1 @@ -51,11 +50,12 @@ function parse_args() { done is_int ${DAYS_TO_KEEP_IMAGES} || abort "days to keep has to be integer" + is_int ${HOURS_TO_KEEP_CLUSTERS} || abort "hours to keep clusters has to be integer" readonly DAYS_TO_KEEP_IMAGES + readonly HOURS_TO_KEEP_CLUSTERS readonly PROJECT_RESOURCE_YAML readonly RE_PROJECT_NAME - readonly GCR_TO_CLEANUP readonly ARTIFACTS_DIR readonly DRY_RUN } @@ -68,25 +68,21 @@ if [[ -z $1 ]]; then abort "missing parameters to the tool" fi -FUNCTION_TO_RUN=$1 -shift parse_args $@ -(( DRY_RUN )) && echo "-- Running in dry-run mode, no image deletion --" +(( DRY_RUN )) && echo "-- Running in dry-run mode, no resource deletion --" +echo "Iterating over projects defined in '${PROJECT_RESOURCE_YAML}', matching '${RE_PROJECT_NAME}" +target_projects="$(grep -Eio "${RE_PROJECT_NAME}" "${PROJECT_RESOURCE_YAML}")" +[[ $? -eq 0 ]] || abort "no project found in $PROJECT_RESOURCE_YAML" +# delete old gcr images echo "Removing images with following rules:" echo "- older than ${DAYS_TO_KEEP_IMAGES} days" -case ${FUNCTION_TO_RUN} in - delete-old-gcr-images) - echo "- from projects defined in '${PROJECT_RESOURCE_YAML}', matching '${RE_PROJECT_NAME}" - delete_old_gcr_images "${PROJECT_RESOURCE_YAML}" "${RE_PROJECT_NAME}" "${DAYS_TO_KEEP_IMAGES}" - ;; - delete-old-images-from-gcr) - echo "- from gcr '${GCR_TO_CLEANUP}'" - delete_old_images_from_gcr "${GCR_TO_CLEANUP}" "${DAYS_TO_KEEP_IMAGES}" - ;; - *) abort "unknown option '${FUNCTION_TO_RUN}'" ;; -esac +delete_old_gcr_images "${target_projects}" "${DAYS_TO_KEEP_IMAGES}" +# delete old clusters +echo "Removing clusters with following rules:" +echo "- older than ${HOURS_TO_KEEP_CLUSTERS} hours" +delete_old_test_clusters "${target_projects}" "${HOURS_TO_KEEP_CLUSTERS}" # Gubernator considers job failure if "junit_*.xml" not found under artifact, # create a placeholder file to make this job succeed