Skip to content

Commit

Permalink
Cleanup job also deletes stale Boskos clusters (#570)
Browse files Browse the repository at this point in the history
* add function for deleting old clusters

* update README

* Update based on PR comments

* Update scripts/library.sh

Co-Authored-By: chaodaiG <[email protected]>

* fix bad merge

* Update based on PR comments
  • Loading branch information
chaodaiG authored and knative-prow-robot committed Mar 26, 2019
1 parent 44d8507 commit 7912912
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 67 deletions.
4 changes: 2 additions & 2 deletions ci/prow/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2662,9 +2662,9 @@ periodics:
command:
- "./tools/cleanup/cleanup.sh"
args:
- "delete-old-gcr-images"
- "--project-resource-yaml ci/prow/boskos/resources.yaml"
- "--days-to-keep 30"
- "--days-to-keep-images 30"
- "--hours-to-keep-clusters 24"
- "--service-account /etc/test-account/service-account.json"
- "--artifacts $(ARTIFACTS)"
volumeMounts:
Expand Down
4 changes: 2 additions & 2 deletions ci/prow/make_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -925,9 +925,9 @@ func generateCleanupPeriodicJob() {
data.Base.DecorationConfig = []string{"timeout: 28800000000000"} // 8 hours
data.Base.Command = cleanupScript
data.Base.Args = []string{
"delete-old-gcr-images",
"--project-resource-yaml ci/prow/boskos/resources.yaml",
"--days-to-keep 30",
"--days-to-keep-images 30",
"--hours-to-keep-clusters 24",
"--service-account " + data.Base.ServiceAccount,
"--artifacts $(ARTIFACTS)"}
addExtraEnvVarsToJob(&data.Base)
Expand Down
6 changes: 6 additions & 0 deletions scripts/library.sh
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,12 @@ function remove_broken_symlinks() {
done
}

# Tell whether the given project is the protected "knative-tests" project.
# Parameters: $1 - project name
# Returns 0 only when $1 is exactly "knative-tests"; 1 otherwise (including
# when $1 is empty).
function is_protected_project() {
  case "$1" in
    knative-tests) return 0 ;;
    *) return 1 ;;
  esac
}

# Returns the canonical path of a filesystem object.
# Parameters: $1 - path to return in canonical form
# $2 - base dir for relative links; optional, defaults to current
Expand Down
32 changes: 18 additions & 14 deletions test/unit/cleanup-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,17 @@ cd ${REPO_ROOT_DIR}

echo ">> Testing directly invoking cleanup script"

test_function ${FAILURE} "error: unknown option" cleanup_script "action-not-exist"
test_function ${FAILURE} "error: missing gcr" cleanup_script "delete-old-images-from-gcr"
test_function ${FAILURE} "error: missing resource" cleanup_script "delete-old-gcr-images"
test_function ${FAILURE} "error: missing parameter" cleanup_script

test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --project-resource-yaml --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --re-project-name --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --gcr-to-cleanup --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --days-to-keep --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script "delete-old-gcr-images" --artifacts --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --project-resource-yaml --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --re-project-name --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --gcr-to-cleanup --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --days-to-keep-images --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --hours-to-keep-clusters --dry-run
test_function ${FAILURE} "error: expecting value following" cleanup_script --artifacts --dry-run

test_function ${FAILURE} "error: days to keep" cleanup_script "delete-old-images-from-gcr" --days-to-keep "a" --dry-run
test_function ${FAILURE} "error: days to keep" cleanup_script "delete-old-gcr-images" --days-to-keep "a" --dry-run
test_function ${FAILURE} "error: days to keep" cleanup_script --days-to-keep-images "a" --dry-run
test_function ${FAILURE} "error: hours to keep" cleanup_script --hours-to-keep-clusters "a" --dry-run

# Test individual functions
echo ">> Testing deleting images from single project"
Expand All @@ -56,9 +55,14 @@ test_function ${SUCCESS} "" mock_gcloud_function delete_old_images_from_gcr ${_F

echo ">> Testing deleting images from multiple projects"

test_function ${FAILURE} "error: missing resource" delete_old_gcr_images
test_function ${FAILURE} "error: missing regex" delete_old_gcr_images "file"
test_function ${FAILURE} "error: missing days" delete_old_gcr_images "file" "regex"
test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_gcr_images ${_PROJECT_RESOURCE_YAML} ${_RE_PROJECT_NAME} 99
test_function ${FAILURE} "error: missing project names" delete_old_gcr_images
test_function ${FAILURE} "error: missing days" delete_old_gcr_images "${_FAKE_BOSKOS_PROJECT_NAME}1"
test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_gcr_images "${_PROJECT_RESOURCE_YAML}1 ${_PROJECT_RESOURCE_YAML}2" 99

echo ">> Testing deleting clusters from multiple projects"

test_function ${FAILURE} "error: missing project names" delete_old_test_clusters
test_function ${FAILURE} "error: missing hours" delete_old_test_clusters "${_FAKE_BOSKOS_PROJECT_NAME}1"
test_function ${SUCCESS} "Start" mock_gcloud_function delete_old_test_clusters "${_FAKE_BOSKOS_PROJECT_NAME}1 ${_FAKE_BOSKOS_PROJECT_NAME}2" 99

echo ">> All tests passed"
26 changes: 9 additions & 17 deletions tools/cleanup/README.md
Original file line number Diff line number Diff line change
@@ -1,34 +1,26 @@
# Resources Clean Up Tool

This tool is designed to clean up stale resources from gcr, for now it only deletes old images created during testing.
This tool is designed to clean up stale test resources. For now it deletes GCR images and GKE clusters created during testing.

## Basic Usage

Directly invoke [cleanup.sh](cleanup.sh) script with certain flags. There is no-op if invoking or sourcing this script without arguments.
Directly invoke [cleanup.sh](cleanup.sh) script with certain flags, but don't source this script.

By default the current gcloud credentials are used to delete the images and clusters. If necessary, use the flag `--service-account _key-file.json_` to specify a service account that will be used to access the GCP projects instead.

### Clean up old images from multiple gcrs
Projects to be cleaned up are expected to be defined in a `resources.yaml` file. To remove old images and clusters from them, call [cleanup.sh](cleanup.sh) with following flags:

Projects to be cleaned up are expected to be defined in a `resources.yaml` file. To remove old images from them, call [cleanup.sh](cleanup.sh) with action "delete-old-gcr-images" and following flags:
- "--project-resource-yaml" as path of `resources.yaml` file - Mandatory
- "--re-project-name" for regex matching projects names - Optional, defaults to `knative-boskos-[a-zA-Z0-9]+`
- "--days-to-keep" - Optional, default `365`
- "--days-to-keep-images" - Optional, defaults to `365` as 1 year
- "--hours-to-keep-clusters" - Optional, defaults to `720` as 30 days
- "--dry-run" - Optional, performs dryrun for all gcloud functions, defaults to false

Example:

```./cleanup.sh "delete-old-gcr-images" --project-resource-yaml "ci/prow/boskos/resources.yaml" --days-to-keep 90```

### Clean up old images from a specific gcr

Cleaning up from a specific gcr is supported, except for some special ones (_knative-release_ and _knative-nightly_). Call [cleanup.sh](cleanup.sh) with action "delete-old-images-from-gcr" and following flags:
- "--gcr-to-cleanup" as name of gcr, e.g. "gcr.io/foo" - Mandatory
- "--days-to-keep" - Optional, default `365`

Example:

```./cleanup.sh "delete-old-images-from-gcr" --gcr-to-cleanup "gcr.io/foo" --days-to-keep 90```
```./cleanup.sh --project-resource-yaml "ci/prow/boskos/resources.yaml" --days-to-keep-images 90 --hours-to-keep-clusters 24```
This command deletes test images older than 90 days and test clusters created more than 24 hours ago.

## Prow Job

There is a weekly prow job that triggers this tool runs at 11:00/12:00PM(Day light saving) PST every Monday. This tool scans all gcr projects defined in [ci/prow/boskos/resources.yaml](/ci/prow/boskos/resources.yaml) and deletes images older than 90 days.
There is a weekly prow job that runs this tool at 11:00/12:00PM (daylight saving) PST every Monday. It scans all GCP projects defined in [ci/prow/boskos/resources.yaml](/ci/prow/boskos/resources.yaml) and deletes images older than 30 days and clusters older than 24 hours.
56 changes: 43 additions & 13 deletions tools/cleanup/cleanup-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,51 @@ function delete_old_images_from_gcr() {
done
}

# Delete old images in the given GCP projects.
# Parameters: $1 - whitespace-separated list of project names
#             $2 - days to keep images
# For every project name in $1, images are removed from "gcr.io/<project>" by
# delegating to delete_old_images_from_gcr.
function delete_old_gcr_images() {
  [[ -z $1 ]] && abort "missing project names"
  [[ -z $2 ]] && abort "missing days to keep images"

  local project
  # $1 is intentionally left unquoted so that it word-splits into one
  # project name per iteration.
  for project in $1; do
    echo "Start deleting images from ${project}"
    delete_old_images_from_gcr "gcr.io/${project}" "$2"
  done
}

# Delete old clusters in the given GCP projects.
# Parameters: $1 - whitespace-separated list of project names
#             $2 - hours to keep clusters (non-negative integer)
# Globals:    DRY_RUN (read) - when non-zero, the delete commands are only
#             printed, not executed.
# Aborts when a target project is protected (see is_protected_project), when
# the cluster listing is malformed, or when a creation time cannot be parsed.
function delete_old_test_clusters() {
  [[ -z $1 ]] && abort "missing project names"
  [[ -z $2 ]] && abort "missing hours to keep clusters"
  [[ "$2" =~ ^[0-9]+$ ]] || abort "hours to keep clusters has to be integer"

  local project current_time target_time create_time
  local cluster_name cluster_createtime cluster_zone
  # $1 is intentionally left unquoted so that it word-splits into one
  # project name per iteration.
  for project in $1; do
    echo "Start deleting clusters from ${project}"

    is_protected_project "${project}" && \
      abort "Target project set to ${project}, which is forbidden"

    current_time=$(date +%s)
    # Clusters created after this instant are kept. Plain epoch arithmetic is
    # used instead of parsing a relative date string, so the cutoff is always
    # exactly 3600 * hours seconds in the past.
    target_time=$(( current_time - 3600 * $2 ))

    # The listing is fed through process substitution rather than a pipe so
    # the loop body runs in the current shell: an abort inside the loop then
    # stops the whole run instead of only ending a pipeline subshell.
    while read -r cluster_name cluster_createtime cluster_zone; do
      [[ -z "${cluster_name}" ]] && continue  # ignore blank lines
      [[ -z "${cluster_zone}" ]] && abort "list cluster output missing cluster zone"
      echo "Checking ${cluster_name} for removal"
      # Declaration and assignment are separate so that a failed parse is
      # detected; an empty value would compare as 0 and make the cluster look
      # old enough to delete.
      create_time=$(date -d "${cluster_createtime}" +%s) \
        || abort "cannot parse creation time '${cluster_createtime}' of cluster ${cluster_name}"
      (( create_time > current_time )) && \
        abort "cluster creation time shouldn't be newer than current time"
      if (( create_time > target_time )); then
        echo "skip deleting as it's created within $2 hours"
        continue
      fi
      if (( DRY_RUN )); then
        echo "[DRY RUN] gcloud container clusters delete -q ${cluster_name} --zone ${cluster_zone} --project ${project}"
      else
        # stdin is redirected so the command cannot consume the remaining
        # lines of the cluster listing that the loop is reading.
        gcloud container clusters delete -q "${cluster_name}" \
          --zone "${cluster_zone}" --project "${project}" < /dev/null
      fi
    done < <(gcloud --format='get(name,createTime,zone)' container clusters list \
      --project="${project}" --limit=99999)
  done
}
34 changes: 15 additions & 19 deletions tools/cleanup/cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@ source $(dirname $0)/cleanup-functions.sh

# Global variables
DAYS_TO_KEEP_IMAGES=365 # Keep images up to 1 year by default
HOURS_TO_KEEP_CLUSTERS=720 # keep clusters up to 30 days by default
RE_PROJECT_NAME="knative-boskos-[a-zA-Z0-9]+"
PROJECT_RESOURCE_YAML=""
GCR_TO_CLEANUP=""
ARTIFACTS_DIR=""
DRY_RUN=0


function parse_args() {
while [[ $# -ne 0 ]]; do
local parameter=$1
Expand All @@ -38,8 +37,8 @@ function parse_args() {
case ${parameter} in
--project-resource-yaml) PROJECT_RESOURCE_YAML=$1 ;;
--re-project-name) RE_PROJECT_NAME=$1 ;;
--gcr-to-cleanup) GCR_TO_CLEANUP=$1 ;;
--days-to-keep) DAYS_TO_KEEP_IMAGES=$1 ;;
--days-to-keep-images) DAYS_TO_KEEP_IMAGES=$1 ;;
--hours-to-keep-clusters) HOURS_TO_KEEP_CLUSTERS=$1 ;;
--artifacts) ARTIFACTS_DIR=$1 ;;
--service-account)
gcloud auth activate-service-account --key-file=$1 || exit 1
Expand All @@ -51,11 +50,12 @@ function parse_args() {
done

is_int ${DAYS_TO_KEEP_IMAGES} || abort "days to keep has to be integer"
is_int ${HOURS_TO_KEEP_CLUSTERS} || abort "hours to keep clusters has to be integer"

readonly DAYS_TO_KEEP_IMAGES
readonly HOURS_TO_KEEP_CLUSTERS
readonly PROJECT_RESOURCE_YAML
readonly RE_PROJECT_NAME
readonly GCR_TO_CLEANUP
readonly ARTIFACTS_DIR
readonly DRY_RUN
}
Expand All @@ -68,25 +68,21 @@ if [[ -z $1 ]]; then
abort "missing parameters to the tool"
fi

FUNCTION_TO_RUN=$1
shift
parse_args $@

(( DRY_RUN )) && echo "-- Running in dry-run mode, no image deletion --"
(( DRY_RUN )) && echo "-- Running in dry-run mode, no resource deletion --"
echo "Iterating over projects defined in '${PROJECT_RESOURCE_YAML}', matching '${RE_PROJECT_NAME}"
target_projects="$(grep -Eio "${RE_PROJECT_NAME}" "${PROJECT_RESOURCE_YAML}")"
[[ $? -eq 0 ]] || abort "no project found in $PROJECT_RESOURCE_YAML"

# delete old gcr images
echo "Removing images with following rules:"
echo "- older than ${DAYS_TO_KEEP_IMAGES} days"
case ${FUNCTION_TO_RUN} in
delete-old-gcr-images)
echo "- from projects defined in '${PROJECT_RESOURCE_YAML}', matching '${RE_PROJECT_NAME}"
delete_old_gcr_images "${PROJECT_RESOURCE_YAML}" "${RE_PROJECT_NAME}" "${DAYS_TO_KEEP_IMAGES}"
;;
delete-old-images-from-gcr)
echo "- from gcr '${GCR_TO_CLEANUP}'"
delete_old_images_from_gcr "${GCR_TO_CLEANUP}" "${DAYS_TO_KEEP_IMAGES}"
;;
*) abort "unknown option '${FUNCTION_TO_RUN}'" ;;
esac
delete_old_gcr_images "${target_projects}" "${DAYS_TO_KEEP_IMAGES}"
# delete old clusters
echo "Removing clusters with following rules:"
echo "- older than ${HOURS_TO_KEEP_CLUSTERS} hours"
delete_old_test_clusters "${target_projects}" "${HOURS_TO_KEEP_CLUSTERS}"

# Gubernator considers job failure if "junit_*.xml" not found under artifact,
# create a placeholder file to make this job succeed
Expand Down

0 comments on commit 7912912

Please sign in to comment.