From 64d005476e4516d75f02fb38f413f00f7e5b3060 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 02:53:16 -0700 Subject: [PATCH 01/62] Fix logs duplication --- hooks/command | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/hooks/command b/hooks/command index ff9840b..a7bafee 100755 --- a/hooks/command +++ b/hooks/command @@ -19,12 +19,10 @@ function cleanup { } trap cleanup EXIT - function tail_logs { - while true ; do - stdbuf -o0 -e0 kubectl logs -f "job/${job_name}" 2>>/dev/null || true - sleep 0.2 - done + # logs will be failing until they're available for streaming, once available we actually start streaming them. + while ! kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done + kubectl logs --follow "job/${job_name}" } echo "--- :kubernetes: Starting Kubernetes Job" From df9227664f7ac2d108a18b72bf605cbda3fdd4fe Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 16:44:12 -0700 Subject: [PATCH 02/62] =?UTF-8?q?Run=20kubectl=20logs=20--follow=20in=20a?= =?UTF-8?q?=20loop,=20it=20actually=20fails=20sometimes=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hooks/command | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index a7bafee..fa4694b 100755 --- a/hooks/command +++ b/hooks/command @@ -22,7 +22,9 @@ trap cleanup EXIT function tail_logs { # logs will be failing until they're available for streaming, once available we actually start streaming them. while ! kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done - kubectl logs --follow "job/${job_name}" + + # It can fail even after we get the first result, so run it in loop (potentially allowing log to be displayed multple times, but chance is low). + while ! kubectl logs --follow "job/${job_name}"; do sleep 0.2; done } echo "--- :kubernetes: Starting Kubernetes Job" From 2142fe64afb0c0af94d4a3bc95c23dc092ef0aef Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 16:51:09 -0700 Subject: [PATCH 03/62] Redirect err to /dev/null, add error example to comment --- hooks/command | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index fa4694b..e0bc657 100755 --- a/hooks/command +++ b/hooks/command @@ -23,8 +23,9 @@ function tail_logs { # logs will be failing until they're available for streaming, once available we actually start streaming them. while ! kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done - # It can fail even after we get the first result, so run it in loop (potentially allowing log to be displayed multple times, but chance is low). - while ! kubectl logs --follow "job/${job_name}"; do sleep 0.2; done + # It can fail even after we get the first result, so run it in loop (potentially allowing log to be displayed multiple times, but chance is low). + # Example of a failure: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" + while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done } echo "--- :kubernetes: Starting Kubernetes Job" From bb3fdf96042e2ccd004c4124ee29bcaa31df6eb2 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 16:59:35 -0700 Subject: [PATCH 04/62] improve comments --- hooks/command | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hooks/command b/hooks/command index e0bc657..8e7658e 100755 --- a/hooks/command +++ b/hooks/command @@ -20,11 +20,12 @@ function cleanup { trap cleanup EXIT function tail_logs { - # logs will be failing until they're available for streaming, once available we actually start streaming them. - while ! kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done + # logs will be failing until they're available for streaming, once available — we actually start attempting to stream them. + while ! kubectl logs --tail 0 --limit-bytes 32 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done - # It can fail even after we get the first result, so run it in loop (potentially allowing log to be displayed multiple times, but chance is low). - # Example of a failure: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" + # Run kubectl logs --follow in a loop since it can still fail: + # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" + # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done } From e6289841eaacf6e4a13785154c01becc62ca9eab Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 18:56:26 -0700 Subject: [PATCH 05/62] Add timeout to the check --- hooks/command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index 8e7658e..8c93980 100755 --- a/hooks/command +++ b/hooks/command @@ -21,7 +21,7 @@ trap cleanup EXIT function tail_logs { # logs will be failing until they're available for streaming, once available — we actually start attempting to stream them. - while ! kubectl logs --tail 0 --limit-bytes 32 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done + while ! timeout 5 kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done # Run kubectl logs --follow in a loop since it can still fail: # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" From 7cf0ad5cf00b3f2f3e2754ed35b46b7a113f2cf3 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 21 May 2020 19:08:01 -0700 Subject: [PATCH 06/62] do not tail --- hooks/command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index 8c93980..308765e 100755 --- a/hooks/command +++ b/hooks/command @@ -21,7 +21,7 @@ trap cleanup EXIT function tail_logs { # logs will be failing until they're available for streaming, once available — we actually start attempting to stream them. - while ! timeout 5 kubectl logs --tail 0 --limit-bytes 1 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done + while ! timeout 5 kubectl logs --limit-bytes 32 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done # Run kubectl logs --follow in a loop since it can still fail: # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" From e86fe72bdb113de67cff3eb95c7d98ab4554da48 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 22 May 2020 23:39:04 -0700 Subject: [PATCH 07/62] Add check for log snapshot content --- hooks/command | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hooks/command b/hooks/command index 308765e..c57dd87 100755 --- a/hooks/command +++ b/hooks/command @@ -13,19 +13,33 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT +readonly job_logs_snapshot_file="$(mktemp)" + function cleanup { # Delete all jobs older than a day kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true + rm -f "$job_logs_snapshot_file" } trap cleanup EXIT function tail_logs { # logs will be failing until they're available for streaming, once available — we actually start attempting to stream them. - while ! timeout 5 kubectl logs --limit-bytes 32 "job/${job_name}" > /dev/null 2>&1; do sleep 0.2; done - - # Run kubectl logs --follow in a loop since it can still fail: + while ! timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null; + do + # If output is not empty we should start attempting to stream the logs. + # Without this check streaming can end up not getting any result (probably due to some race in k8s). + # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information. + if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then + break + else + sleep 0.2 + fi + done + + # Run kubectl logs --follow in a loop since it can fail: # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). + # 3) It can hang not getting any result, but that's why we check for contents in a loop above. while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done } From 1a8f2e66dd5afae12eccea07df1d31178e227f9b Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 22 May 2020 23:48:25 -0700 Subject: [PATCH 08/62] Add debug --- hooks/command | 1 + 1 file changed, 1 insertion(+) diff --git a/hooks/command b/hooks/command index c57dd87..1cf5a43 100755 --- a/hooks/command +++ b/hooks/command @@ -30,6 +30,7 @@ function tail_logs { # Without this check streaming can end up not getting any result (probably due to some race in k8s). # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information. if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then + echo "[debug] Got logs snapshot, attempting to stream the logs now..." break else sleep 0.2 From a7e7b627b46680264fe7cf1a4f4edcb1aaf438d5 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Sat, 23 May 2020 00:37:41 -0700 Subject: [PATCH 09/62] fix contents check --- hooks/command | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/hooks/command b/hooks/command index 1cf5a43..3fd4a29 100755 --- a/hooks/command +++ b/hooks/command @@ -23,24 +23,21 @@ function cleanup { trap cleanup EXIT function tail_logs { - # logs will be failing until they're available for streaming, once available — we actually start attempting to stream them. - while ! timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null; + # Once logs are not empty we should start attempting to stream them. + # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. + while true; do - # If output is not empty we should start attempting to stream the logs. - # Without this check streaming can end up not getting any result (probably due to some race in k8s). - # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information. + timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then - echo "[debug] Got logs snapshot, attempting to stream the logs now..." break - else - sleep 0.2 fi + sleep 0.2 done # Run kubectl logs --follow in a loop since it can fail: # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). - # 3) It can hang not getting any result, but that's why we check for contents in a loop above. + # 3) It can hang not providing any result, that's why we check not only exit code but also contents in the loop above. while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done } From 391faaa7def95a043e742ddcbbe715084e76eadc Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Sat, 23 May 2020 00:59:18 -0700 Subject: [PATCH 10/62] print debug --- hooks/command | 1 + 1 file changed, 1 insertion(+) diff --git a/hooks/command b/hooks/command index 3fd4a29..c9fdce1 100755 --- a/hooks/command +++ b/hooks/command @@ -29,6 +29,7 @@ function tail_logs { do timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then + echo "[debug] Got non-empty log content: $(cat "$job_logs_snapshot_file")" break fi sleep 0.2 From 27af56806be9eacc2efa1c6145700b9c396bd6a5 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Sat, 23 May 2020 01:04:42 -0700 Subject: [PATCH 11/62] print more debug --- hooks/command | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index c9fdce1..e2c24b1 100755 --- a/hooks/command +++ b/hooks/command @@ -32,7 +32,8 @@ function tail_logs { echo "[debug] Got non-empty log content: $(cat "$job_logs_snapshot_file")" break fi - sleep 0.2 + echo "[debug] Sleeping again" + sleep 1 done # Run kubectl logs --follow in a loop since it can fail: From b55eaf8cb213261fed9296cab1e97b806c6aa0a2 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Sat, 23 May 2020 01:08:57 -0700 Subject: [PATCH 12/62] print more debug --- hooks/command | 1 + 1 file changed, 1 insertion(+) diff --git a/hooks/command b/hooks/command index e2c24b1..88fa92f 100755 --- a/hooks/command +++ b/hooks/command @@ -27,6 +27,7 @@ function tail_logs { # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do + echo "[debug] checking logs snapshots" timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then echo "[debug] Got non-empty log content: $(cat "$job_logs_snapshot_file")" From 2f581bacb6daa013b37943614872051f03f58e8a Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 14:43:26 -0700 Subject: [PATCH 13/62] Remove timeout --- hooks/command | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index 88fa92f..6bcdae7 100755 --- a/hooks/command +++ b/hooks/command @@ -28,13 +28,13 @@ function tail_logs { while true; do echo "[debug] checking logs snapshots" - timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null + kubectl logs --tail --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then echo "[debug] Got non-empty log content: $(cat "$job_logs_snapshot_file")" break fi echo "[debug] Sleeping again" - sleep 1 + sleep 0.2 done # Run kubectl logs --follow in a loop since it can fail: From df7444b7b5b58d651da1af21417daf69551d9718 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 14:54:47 -0700 Subject: [PATCH 14/62] add set -x, dont use file --- hooks/command | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hooks/command b/hooks/command index 6bcdae7..d576033 100755 --- a/hooks/command +++ b/hooks/command @@ -23,14 +23,16 @@ function cleanup { trap cleanup EXIT function tail_logs { + set -x # Once logs are not empty we should start attempting to stream them. # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do echo "[debug] checking logs snapshots" - kubectl logs --tail --limit-bytes 1024 "job/${job_name}" > "$job_logs_snapshot_file" 2>/dev/null - if [[ -n "$(cat "$job_logs_snapshot_file")" ]]; then - echo "[debug] Got non-empty log content: $(cat "$job_logs_snapshot_file")" + local log_snapshot + log_snapshot="$(timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" + if [[ -n "$log_snapshot" ]]; then + echo -e "[debug] Got non-empty log content: $log_snapshot)" break fi echo "[debug] Sleeping again" From 54583a92a87c9065528639a73dff3e60b2e6b0e6 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 15:03:31 -0700 Subject: [PATCH 15/62] set +e --- hooks/command | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index d576033..0dbd3e2 100755 --- a/hooks/command +++ b/hooks/command @@ -30,7 +30,9 @@ function tail_logs { do echo "[debug] checking logs snapshots" local log_snapshot - log_snapshot="$(timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" + set +e + log_snapshot="$(kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" + set -e if [[ -n "$log_snapshot" ]]; then echo -e "[debug] Got non-empty log content: $log_snapshot)" break From 81e99da2ef69d291d5e670a333fa09dbf61b4542 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 15:10:10 -0700 Subject: [PATCH 16/62] Remove -xg --- hooks/command | 1 - 1 file changed, 1 deletion(-) diff --git a/hooks/command b/hooks/command index 0dbd3e2..5eeea37 100755 --- a/hooks/command +++ b/hooks/command @@ -23,7 +23,6 @@ function cleanup { trap cleanup EXIT function tail_logs { - set -x # Once logs are not empty we should start attempting to stream them. # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; From 75cbe9c77a016c15e75f99e2f9a3586fd000f01b Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 15:47:30 -0700 Subject: [PATCH 17/62] Add race fix --- hooks/command | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/hooks/command b/hooks/command index 5eeea37..7875314 100755 --- a/hooks/command +++ b/hooks/command @@ -13,12 +13,12 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT -readonly job_logs_snapshot_file="$(mktemp)" +readonly job_log_complete_marker_file="$(mktemp)" function cleanup { # Delete all jobs older than a day kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true - rm -f "$job_logs_snapshot_file" + rm -f "$job_log_complete_marker_file" } trap cleanup EXIT @@ -27,16 +27,13 @@ function tail_logs { # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do - echo "[debug] checking logs snapshots" local log_snapshot set +e - log_snapshot="$(kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then - echo -e "[debug] Got non-empty log content: $log_snapshot)" break fi - echo "[debug] Sleeping again" sleep 0.2 done @@ -45,6 +42,8 @@ function tail_logs { # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). # 3) It can hang not providing any result, that's why we check not only exit code but also contents in the loop above. while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done + + echo "0" > "$job_log_complete_marker_file" } echo "--- :kubernetes: Starting Kubernetes Job" @@ -78,6 +77,14 @@ done echo echo "--- :kubernetes: Job status: $jobstatus" + +# Wait for logs to be printed because printing runs in a fork and we're racing with it. +readonly log_wait_start_time="$SECONDS" +while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt 15 ]] +do + sleep 0.5 +done + status=1 if [[ "$jobstatus" == "Complete" ]] ; then echo "success" From 97ae246f49e09ada33dd0063341f7b202741bb22 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 26 May 2020 15:57:41 -0700 Subject: [PATCH 18/62] Remove empty print, move out local var --- hooks/command | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index 7875314..f64918f 100755 --- a/hooks/command +++ b/hooks/command @@ -23,11 +23,11 @@ function cleanup { trap cleanup EXIT function tail_logs { + local log_snapshot="" # Once logs are not empty we should start attempting to stream them. # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do - local log_snapshot set +e log_snapshot="$(timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" set -e @@ -75,7 +75,6 @@ while [[ -z "$jobstatus" ]] ; do fi done -echo echo "--- :kubernetes: Job status: $jobstatus" # Wait for logs to be printed because printing runs in a fork and we're racing with it. From 724410e364782201960b2b32e95d41e83275e2e2 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 1 Jun 2020 18:41:34 -0700 Subject: [PATCH 19/62] Increase log loop interval to avoid k8s API endpoint overload, allow user value --- hooks/command | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index f64918f..ab47289 100755 --- a/hooks/command +++ b/hooks/command @@ -13,6 +13,7 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT +readonly log_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_LOG_LOOP_INTERVAL:-3}" readonly job_log_complete_marker_file="$(mktemp)" function cleanup { @@ -34,7 +35,7 @@ function tail_logs { if [[ -n "$log_snapshot" ]]; then break fi - sleep 0.2 + sleep "$log_loop_interval_seconds" done # Run kubectl logs --follow in a loop since it can fail: @@ -79,7 +80,7 @@ echo "--- :kubernetes: Job status: $jobstatus" # Wait for logs to be printed because printing runs in a fork and we're racing with it. readonly log_wait_start_time="$SECONDS" -while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt 15 ]] +while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt 30 ]] do sleep 0.5 done From c4735dcdd825d51d8aea691126ad1da551fa27bb Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 1 Jun 2020 20:09:19 -0700 Subject: [PATCH 20/62] Retry all kubectl commands --- hooks/command | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/hooks/command b/hooks/command index ab47289..5746246 100755 --- a/hooks/command +++ b/hooks/command @@ -13,19 +13,46 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT +readonly job_apply_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_JOB_APPLY_LOOP_INTERVAL:-5}" +readonly job_apply_loop_timeout_seconds="${K8S_BUILDKITE_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" readonly log_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_LOG_LOOP_INTERVAL:-3}" +readonly cleanup_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_CLEANUP_LOOP_INTERVAL:-5}" +readonly cleanup_loop_timeout_seconds="${K8S_BUILDKITE_PLUGIN_CLEANUP_LOOP_INTERVAL:-60}" + +readonly job_json="$(mktemp)" readonly job_log_complete_marker_file="$(mktemp)" function cleanup { # Delete all jobs older than a day - kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true - rm -f "$job_log_complete_marker_file" + local -r cleanup_start_time="$SECONDS" + local cleanup_old_jobs_exit_code="" + + while [[ "$((SECONDS - cleanup_start_time))" -lt "$cleanup_loop_timeout_seconds" ]] + do + set +e + local old_jobs="$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" + if [[ -n "$old_jobs" ]]; then + kubectl delete job "$old_jobs" + fi + cleanup_old_jobs_exit_code="$?" + set -e + + if [[ "$cleanup_old_jobs_exit_code" == "0" ]]; then + break + else + echo "Attempt to cleanup old jobs failed, exit code '$cleanup_old_jobs_exit_code'" + sleep "$cleanup_loop_interval_seconds" + fi + done + + echo "Cleanup old jobs exit code '$cleanup_old_jobs_exit_code'" + rm -f "$job_json" "$job_log_complete_marker_file" } trap cleanup EXIT function tail_logs { local log_snapshot="" - # Once logs are not empty we should start attempting to stream them. + # Once logs are not empty we start attempting to stream them. # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do @@ -42,7 +69,10 @@ function tail_logs { # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). # 3) It can hang not providing any result, that's why we check not only exit code but also contents in the loop above. - while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do sleep 0.2; done + while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; + do + sleep "$log_loop_interval_seconds" + done echo "0" > "$job_log_complete_marker_file" } @@ -57,7 +87,26 @@ jsonnet \ --tla-code "agentEnv=$(jq -c -n env)" \ --tla-code patchFunc \ "${basedir}/lib/job.jsonnet" \ - | kubectl apply -f - + > "$job_json" + +readonly job_apply_start_time="$SECONDS" +job_apply_exit_code="" + +while [[ "$((SECONDS - job_apply_start_time))" -lt "$job_apply_loop_timeout_seconds" ]] +do + set +e + kubectl apply -f "$job_json" + job_apply_exit_code="$?" + set -e + + if [[ "$job_apply_exit_code" == "0" ]]; then + break + else + echo "Attempt apply the job failed, exit code '$job_apply_exit_code'" + sleep "$job_apply_loop_interval_seconds" + fi +done +echo "Apply job exit code '$job_apply_exit_code'" echo "Timeout: ${timeout}s" @@ -65,11 +114,13 @@ echo "+++ :kubernetes: Running image: ${BUILDKITE_PLUGIN_K8S_IMAGE}" tail_logs & -sleeper=2 +readonly sleeper="${K8S_BUILDKITE_PLUGIN_SLEEPER_LOOP_INTERVAL:-5}" counter=${timeout} jobstatus="" while [[ -z "$jobstatus" ]] ; do + set +e jobstatus=$(kubectl get job "${job_name}" -o 'jsonpath={.status.conditions[].type}') + set -e sleep $sleeper if [[ $timeout -gt 0 ]]; then (( counter -= sleeper )) || jobstatus="timeout" From 5f039903aacec5a63f6508f36ae153c6250b8c16 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 1 Jun 2020 20:32:13 -0700 Subject: [PATCH 21/62] Add retries to pre-exit --- hooks/command | 24 ++++++++++++------------ hooks/pre-exit | 28 +++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/hooks/command b/hooks/command index 5746246..6810e1c 100755 --- a/hooks/command +++ b/hooks/command @@ -13,11 +13,12 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT -readonly job_apply_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_JOB_APPLY_LOOP_INTERVAL:-5}" -readonly job_apply_loop_timeout_seconds="${K8S_BUILDKITE_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" -readonly log_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_LOG_LOOP_INTERVAL:-3}" -readonly cleanup_loop_interval_seconds="${K8S_BUILDKITE_PLUGIN_CLEANUP_LOOP_INTERVAL:-5}" -readonly cleanup_loop_timeout_seconds="${K8S_BUILDKITE_PLUGIN_CLEANUP_LOOP_INTERVAL:-60}" +readonly job_apply_loop_interval_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_INTERVAL:-5}" +readonly job_apply_loop_timeout_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" +readonly log_loop_interval_seconds="${K8S_PLUGIN_LOG_LOOP_INTERVAL:-3}" +readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:-5}" +readonly cleanup_old_jobs_loop_interval_seconds="${K8S_PLUGIN_CLEANUP_OLD_JOBS_LOOP_INTERVAL:-5}" +readonly cleanup_old_jobs_loop_timeout_seconds="${K8S_PLUGIN_CLEANUP_OLD_JOBS_LOOP_INTERVAL:-2}" readonly job_json="$(mktemp)" readonly job_log_complete_marker_file="$(mktemp)" @@ -27,7 +28,7 @@ function cleanup { local -r cleanup_start_time="$SECONDS" local cleanup_old_jobs_exit_code="" - while [[ "$((SECONDS - cleanup_start_time))" -lt "$cleanup_loop_timeout_seconds" ]] + while [[ "$((SECONDS - cleanup_start_time))" -lt "$cleanup_old_jobs_loop_timeout_seconds" ]] do set +e local old_jobs="$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" @@ -41,7 +42,7 @@ function cleanup { break else echo "Attempt to cleanup old jobs failed, exit code '$cleanup_old_jobs_exit_code'" - sleep "$cleanup_loop_interval_seconds" + sleep "$cleanup_old_jobs_loop_interval_seconds" fi done @@ -102,11 +103,11 @@ do if [[ "$job_apply_exit_code" == "0" ]]; then break else - echo "Attempt apply the job failed, exit code '$job_apply_exit_code'" + echo "Attempt to schedule the job failed, exit code '$job_apply_exit_code'" sleep "$job_apply_loop_interval_seconds" fi done -echo "Apply job exit code '$job_apply_exit_code'" +echo "Schedule job exit code '$job_apply_exit_code'" echo "Timeout: ${timeout}s" @@ -114,16 +115,15 @@ echo "+++ :kubernetes: Running image: ${BUILDKITE_PLUGIN_K8S_IMAGE}" tail_logs & -readonly sleeper="${K8S_BUILDKITE_PLUGIN_SLEEPER_LOOP_INTERVAL:-5}" counter=${timeout} jobstatus="" while [[ -z "$jobstatus" ]] ; do set +e jobstatus=$(kubectl get job "${job_name}" -o 'jsonpath={.status.conditions[].type}') set -e - sleep $sleeper + sleep "$job_status_loop_sleep_interval" if [[ $timeout -gt 0 ]]; then - (( counter -= sleeper )) || jobstatus="timeout" + (( counter -= job_status_loop_sleep_interval )) || jobstatus="timeout" fi done diff --git a/hooks/pre-exit b/hooks/pre-exit index 2fb580b..258b6a7 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,10 +6,28 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -pod=$(kubectl get pod --output=name -l "job-name=${job_name}") -if [[ -n "${pod}" ]] ; then - kubectl patch --patch '{"spec":{"activeDeadlineSeconds":1}}' "${pod}" -fi +readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" +readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" +readonly job_cleanup_start_time="$SECONDS" -kubectl patch --patch '{"spec":{"activeDeadlineSeconds":1}}' "job/${job_name}" +job_cleanup_exit_code="" +while [[ "$((SECONDS - job_cleanup_start_time))" -lt "$job_cleanup_loop_timeout_seconds" ]] +do + set +e + pod=$(kubectl get pod --output=name -l "job-name=${job_name}") + if [[ -n "${pod}" ]] ; then + kubectl patch --patch '{"spec":{"activeDeadlineSeconds":1}}' "${pod}" + fi + kubectl patch --patch '{"spec":{"activeDeadlineSeconds":1}}' "job/${job_name}" + job_cleanup_exit_code="$?" + set -e + if [[ "$job_cleanup_exit_code" == "0" ]]; then + break + else + echo "Attempt to cleanup the job failed, exit code '$job_cleanup_exit_code'" + sleep "$job_cleanup_loop_interval_seconds" + fi +done + +echo "Job cleanup exit code '$job_cleanup_exit_code'" From 0ba78db9f7cf2ad82aef18333bb6d804fa7b0a4e Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 1 Jun 2020 20:45:06 -0700 Subject: [PATCH 22/62] Add ttlSecondsAfterFinished of 1 day, same as cleanup --- lib/job.jsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 24a3b17..3ca1686 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -260,6 +260,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ backoffLimit: 0, activeDeadlineSeconds: deadline, completions: 1, + ttlSecondsAfterFinished: 86400, template: { metadata: { labels: labels, From d63009da41880c526cd1ae354207beceeff0bffc Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 1 Jun 2020 20:46:13 -0700 Subject: [PATCH 23/62] rely only on ttlSecondsAfterFinished --- hooks/command | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/hooks/command b/hooks/command index 6810e1c..af13f4d 100755 --- a/hooks/command +++ b/hooks/command @@ -17,36 +17,11 @@ readonly job_apply_loop_interval_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_INTERVAL:- readonly job_apply_loop_timeout_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" readonly log_loop_interval_seconds="${K8S_PLUGIN_LOG_LOOP_INTERVAL:-3}" readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:-5}" -readonly cleanup_old_jobs_loop_interval_seconds="${K8S_PLUGIN_CLEANUP_OLD_JOBS_LOOP_INTERVAL:-5}" -readonly cleanup_old_jobs_loop_timeout_seconds="${K8S_PLUGIN_CLEANUP_OLD_JOBS_LOOP_INTERVAL:-2}" readonly job_json="$(mktemp)" readonly job_log_complete_marker_file="$(mktemp)" function cleanup { - # Delete all jobs older than a day - local -r cleanup_start_time="$SECONDS" - local cleanup_old_jobs_exit_code="" - - while [[ "$((SECONDS - cleanup_start_time))" -lt "$cleanup_old_jobs_loop_timeout_seconds" ]] - do - set +e - local old_jobs="$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" - if [[ -n "$old_jobs" ]]; then - kubectl delete job "$old_jobs" - fi - cleanup_old_jobs_exit_code="$?" - set -e - - if [[ "$cleanup_old_jobs_exit_code" == "0" ]]; then - break - else - echo "Attempt to cleanup old jobs failed, exit code '$cleanup_old_jobs_exit_code'" - sleep "$cleanup_old_jobs_loop_interval_seconds" - fi - done - - echo "Cleanup old jobs exit code '$cleanup_old_jobs_exit_code'" rm -f "$job_json" "$job_log_complete_marker_file" } trap cleanup EXIT From 732803622ad50040babbf8a86813b29d91d3ef56 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 15:41:16 -0700 Subject: [PATCH 24/62] Reduce TTL to ease load on apiserver --- lib/job.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 3ca1686..fd100d8 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -260,7 +260,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ backoffLimit: 0, activeDeadlineSeconds: deadline, completions: 1, - ttlSecondsAfterFinished: 86400, + ttlSecondsAfterFinished: 3600, template: { metadata: { labels: labels, From 5f9f5822aa13da4f363ecebf516769ed5780e6dc Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 20:44:11 -0700 Subject: [PATCH 25/62] print env vars in pre-exit, catch exit code --- hooks/pre-exit | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hooks/pre-exit b/hooks/pre-exit index 258b6a7..4bd4b63 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,6 +6,8 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" +printenv + readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" readonly job_cleanup_start_time="$SECONDS" From 8fe3ce5f1a4a2c306c7d7a2bd2121d0c1ad0fabd Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 21:29:22 -0700 Subject: [PATCH 26/62] also print cmd args in pre-exit --- hooks/command | 4 ++-- hooks/pre-exit | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hooks/command b/hooks/command index af13f4d..2749bc1 100755 --- a/hooks/command +++ b/hooks/command @@ -78,11 +78,11 @@ do if [[ "$job_apply_exit_code" == "0" ]]; then break else - echo "Attempt to schedule the job failed, exit code '$job_apply_exit_code'" + echo "Attempt to apply the job failed, exit code '$job_apply_exit_code'" sleep "$job_apply_loop_interval_seconds" fi done -echo "Schedule job exit code '$job_apply_exit_code'" +echo "Apply job exit code '$job_apply_exit_code'" echo "Timeout: ${timeout}s" diff --git a/hooks/pre-exit b/hooks/pre-exit index 4bd4b63..175ecf7 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,7 +6,8 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -printenv +env +echo "cmd args: $@" readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" From d6b7b521a2351279f6afa3a71eb839475e478577 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 21:44:42 -0700 Subject: [PATCH 27/62] Skip job cleanup if command successful, saves 3 kubectl calls --- hooks/pre-exit | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hooks/pre-exit b/hooks/pre-exit index 175ecf7..3e950e4 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,8 +6,10 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -env -echo "cmd args: $@" +if [[ "$BUILDKITE_COMMAND_EXIT_STATUS" == "0" ]]; then + # If command succeeded there is no need to cleanup k8s jobs (reduces load on apiserver). + exit 0 +fi readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" From df4880b9ae3fbfbd377a2a429670d0764b62a110 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 21:45:34 -0700 Subject: [PATCH 28/62] Reduce ttl to 600 --- lib/job.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index fd100d8..22296e5 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -260,7 +260,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ backoffLimit: 0, activeDeadlineSeconds: deadline, completions: 1, - ttlSecondsAfterFinished: 3600, + ttlSecondsAfterFinished: 600, template: { metadata: { labels: labels, From 21042378fc4ba3cbc174a362bc1aefd269814139 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 21:54:26 -0700 Subject: [PATCH 29/62] Add job ttl property with old value as default --- lib/job.jsonnet | 3 ++- plugin.yml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 22296e5..22117ef 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -68,6 +68,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ BUILDKITE_PLUGIN_K8S_RESOURCES_REQUEST_MEMORY: '', BUILDKITE_PLUGIN_K8S_RESOURCES_LIMIT_MEMORY: '', BUILDKITE_PLUGIN_K8S_WORKDIR: std.join('/', [env.BUILDKITE_BUILD_PATH, buildSubPath]), + BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED: 86400, } + agentEnv, local stepEnv = @@ -260,7 +261,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ backoffLimit: 0, activeDeadlineSeconds: deadline, completions: 1, - ttlSecondsAfterFinished: 600, + ttlSecondsAfterFinished: env.BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED, template: { metadata: { labels: labels, diff --git a/plugin.yml b/plugin.yml index beea8fc..e207db8 100644 --- a/plugin.yml +++ b/plugin.yml @@ -27,6 +27,8 @@ configuration: type: [string, array] privileged: type: boolean + job-ttl-seconds-after-finished: + type: number secret-name: type: string git-credentials-secret-name: From 3bc537a0d2148ff6bf14050dcda605847ecc4fbb Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 2 Jun 2020 22:00:40 -0700 Subject: [PATCH 30/62] Parse ttl as int --- lib/job.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 22117ef..1eb7a9b 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -261,7 +261,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ backoffLimit: 0, activeDeadlineSeconds: deadline, completions: 1, - ttlSecondsAfterFinished: env.BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED, + ttlSecondsAfterFinished: std.parseInt(env.BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED), template: { metadata: { labels: labels, From b0ef161d87eb888266b53df461b9d83569311285 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 3 Jun 2020 14:37:12 -0700 Subject: [PATCH 31/62] debug BUILDKITE_ENV_FILE encoding --- hooks/command | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hooks/command b/hooks/command index 2749bc1..ce19a02 100755 --- a/hooks/command +++ b/hooks/command @@ -57,6 +57,9 @@ echo "--- :kubernetes: Starting Kubernetes Job" export patchFunc=${BUILDKITE_PLUGIN_K8S_PATCH:-"function(f) f"} +echo "BUILDKITE_ENV_FILE contents:" +cat "$BUILDKITE_ENV_FILE" + jsonnet \ --tla-str "jobName=${job_name}" \ --tla-str-file "stepEnvFile=${BUILDKITE_ENV_FILE}" \ From e28e51a51738a77d30ef5d644f7ff9d7db2045b5 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 3 Jun 2020 14:42:36 -0700 Subject: [PATCH 32/62] Strip " from env vars --- lib/job.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 1eb7a9b..d2eff2c 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -76,7 +76,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ { local kv = std.splitLimit(l, '=', 1), name: kv[0], - value: kv[1], + value: std.stripChars(kv[1], '"'), } for l in std.split(stepEnvFile, '\n') if l != '' && !std.startsWith(l, 'BUILDKITE') From 0ea00ea9e5f84ea82270eca193ec5187e23c66c5 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 3 Jun 2020 15:26:49 -0700 Subject: [PATCH 33/62] Add comment, remove print --- hooks/command | 3 --- lib/job.jsonnet | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/hooks/command b/hooks/command index ce19a02..2749bc1 100755 --- a/hooks/command +++ b/hooks/command @@ -57,9 +57,6 @@ echo "--- :kubernetes: Starting Kubernetes Job" export patchFunc=${BUILDKITE_PLUGIN_K8S_PATCH:-"function(f) f"} -echo "BUILDKITE_ENV_FILE contents:" -cat "$BUILDKITE_ENV_FILE" - jsonnet \ --tla-str "jobName=${job_name}" \ --tla-str-file "stepEnvFile=${BUILDKITE_ENV_FILE}" \ diff --git a/lib/job.jsonnet b/lib/job.jsonnet index d2eff2c..402cf65 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -76,6 +76,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ { local kv = std.splitLimit(l, '=', 1), name: kv[0], + # Remove outer qoutes added by Buildkite. value: std.stripChars(kv[1], '"'), } for l in std.split(stepEnvFile, '\n') From e1b42c2812685b58316ca637b37d42b0c07d8142 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 3 Jun 2020 17:24:41 -0700 Subject: [PATCH 34/62] Remove quote handling for now --- lib/job.jsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 402cf65..1eb7a9b 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -76,8 +76,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ { local kv = std.splitLimit(l, '=', 1), name: kv[0], - # Remove outer qoutes added by Buildkite. - value: std.stripChars(kv[1], '"'), + value: kv[1], } for l in std.split(stepEnvFile, '\n') if l != '' && !std.startsWith(l, 'BUILDKITE') From 7f0ba66e96eadadd66b80cf9d0bbdfbadfc8fd47 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 4 Jun 2020 22:46:09 -0700 Subject: [PATCH 35/62] Limit label values to 63 as per k8s spec --- lib/job.jsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 1eb7a9b..2799f82 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -35,10 +35,11 @@ local numberSuffix(s) = local labelChars = std.set(std.stringChars('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.')); local labelValue(s) = - std.join('', [ + local sanitizedValue = std.join('', [ if std.setMember(c, labelChars) then c else '_' for c in std.stringChars(s) ]); + if std.length(sanitizedValue) < 63 then sanitizedValue else std.substr(sanitizedValue, 0, 63); function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ local buildSubPath = std.join('/', [ From 1b34a6ddc241de732683b4f69a061fe6e9ab60e7 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 5 Jun 2020 01:32:28 -0700 Subject: [PATCH 36/62] Fix default TTL parseInt --- lib/job.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index 2799f82..eaee22e 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -69,7 +69,7 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ BUILDKITE_PLUGIN_K8S_RESOURCES_REQUEST_MEMORY: '', BUILDKITE_PLUGIN_K8S_RESOURCES_LIMIT_MEMORY: '', BUILDKITE_PLUGIN_K8S_WORKDIR: std.join('/', [env.BUILDKITE_BUILD_PATH, buildSubPath]), - BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED: 86400, + BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED: '86400', } + agentEnv, local stepEnv = From 6493b5d37fbbf00fe167de79702476c2833f7823 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 6 Jul 2020 15:47:00 -0700 Subject: [PATCH 37/62] Remove build/message from annotation due to k8s annotations length limit --- lib/job.jsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index eaee22e..bf73f51 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -130,7 +130,6 @@ function(jobName, agentEnv={}, stepEnvFile='', patchFunc=identity) patchFunc({ 'build/creator-email': env.BUILDKITE_BUILD_CREATOR_EMAIL, 'build/id': env.BUILDKITE_BUILD_ID, 'build/url': env.BUILDKITE_BUILD_URL, - 'build/message': env.BUILDKITE_MESSAGE, 'build/number': env.BUILDKITE_BUILD_NUMBER, 'build/organization': env.BUILDKITE_ORGANIZATION_SLUG, 'build/repo': env.BUILDKITE_REPO, From b494d8856a4c335a31458d742b5d94bd941fd820 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Mon, 20 Jul 2020 17:03:05 -0700 Subject: [PATCH 38/62] Add BUILDKITE_BUILD_ID to allowedEnvs --- lib/job.jsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/job.jsonnet b/lib/job.jsonnet index bf73f51..c6c795b 100644 --- a/lib/job.jsonnet +++ b/lib/job.jsonnet @@ -8,6 +8,7 @@ local allowedEnvs = std.set( 'BUILDKITE_MESSAGE', 'BUILDKITE_BUILD_CREATOR', 'BUILDKITE_BUILD_CREATOR_EMAIL', + 'BUILDKITE_BUILD_ID', 'BUILDKITE_BUILD_NUMBER', 'BUILDKITE_BUILD_PATH', 'BUILDKITE_BUILD_URL', From 9c8e6c4b159e99c6743a799a19430f93d18ca125 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 21 Jul 2020 14:24:11 -0700 Subject: [PATCH 39/62] Add more variables to allow overrides --- hooks/command | 13 +++++++++---- hooks/pre-exit | 12 +++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/hooks/command b/hooks/command index 2749bc1..2faf781 100755 --- a/hooks/command +++ b/hooks/command @@ -13,10 +13,15 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT +# Default values can be overridden by setting "K8S_PLUGIN_*" env vars as used below. readonly job_apply_loop_interval_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_INTERVAL:-5}" readonly job_apply_loop_timeout_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" readonly log_loop_interval_seconds="${K8S_PLUGIN_LOG_LOOP_INTERVAL:-3}" +readonly log_loop_timeout_seconds="${K8S_PLUGIN_LOG_LOOP_TIMEOUT:-5}" +readonly log_loop_snapshot_limit_bytes="${K8S_PLUGIN_LOG_LOOP_SNAPSHOT_LIMIT_BYTES:-1024}" readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:-5}" +readonly log_complete_loop_interval_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_INTERVAL:-1}" +readonly log_complete_loop_timeout_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly job_json="$(mktemp)" readonly job_log_complete_marker_file="$(mktemp)" @@ -33,7 +38,7 @@ function tail_logs { while true; do set +e - log_snapshot="$(timeout 5 kubectl logs --limit-bytes 1024 "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout "$log_loop_timeout_seconds" kubectl logs --limit-bytes "$log_loop_snapshot_limit_bytes" "job/${job_name}" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then break @@ -104,11 +109,11 @@ done echo "--- :kubernetes: Job status: $jobstatus" -# Wait for logs to be printed because printing runs in a fork and we're racing with it. +# Wait for logs to be fully printed, printing runs in a separate process and we're racing with it. readonly log_wait_start_time="$SECONDS" -while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt 30 ]] +while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt "$log_complete_loop_timeout_seconds" ]] do - sleep 0.5 + sleep "$log_complete_loop_interval_seconds" done status=1 diff --git a/hooks/pre-exit b/hooks/pre-exit index 3e950e4..67f9b24 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,15 +6,17 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -if [[ "$BUILDKITE_COMMAND_EXIT_STATUS" == "0" ]]; then - # If command succeeded there is no need to cleanup k8s jobs (reduces load on apiserver). - exit 0 -fi - +# Default values can be overridden by setting "K8S_PLUGIN_*" env vars as used below. +readonly job_cleanup_if_successful="${K8S_PLUGIN_JOB_CLEANUP_IF_SUCCESSFUL:-true}" readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" readonly job_cleanup_start_time="$SECONDS" +# Optionally skip cleanup if Job succeeded (reduces load on k8s apiserver). +if [[ "$BUILDKITE_COMMAND_EXIT_STATUS" == "0" && "$job_cleanup_if_successful" != "true" ]]; then + exit 0 +fi + job_cleanup_exit_code="" while [[ "$((SECONDS - job_cleanup_start_time))" -lt "$job_cleanup_loop_timeout_seconds" ]] do From 154765b2f8a7a16d575e33bab185e01d0474ea55 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 21 Jul 2020 16:37:53 -0700 Subject: [PATCH 40/62] Add comment about job cleanup --- hooks/pre-exit | 1 + 1 file changed, 1 insertion(+) diff --git a/hooks/pre-exit b/hooks/pre-exit index 67f9b24..c04492a 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -13,6 +13,7 @@ readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT readonly job_cleanup_start_time="$SECONDS" # Optionally skip cleanup if Job succeeded (reduces load on k8s apiserver). +# Set to "false" if you have TTL Job Controller, https://github.com/lwolf/kube-cleanup-operator or other cleanup controller. if [[ "$BUILDKITE_COMMAND_EXIT_STATUS" == "0" && "$job_cleanup_if_successful" != "true" ]]; then exit 0 fi From 0873b04a7de16377e6d5aa950105a95fe5d0eff7 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 22 Jul 2020 18:40:35 -0700 Subject: [PATCH 41/62] Add debug echo for job spec --- hooks/command | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hooks/command b/hooks/command index a4db073..a04b971 100755 --- a/hooks/command +++ b/hooks/command @@ -23,6 +23,7 @@ readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:- readonly log_complete_loop_interval_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_INTERVAL:-1}" readonly log_complete_loop_timeout_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} +readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} readonly job_log_complete_marker_file="$(mktemp)" @@ -63,7 +64,7 @@ echo "--- :kubernetes: Starting Kubernetes Job" export patchFunc=${BUILDKITE_PLUGIN_K8S_PATCH:-"function(f) f"} -jobspec="$(jsonnet \ +job_spec="$(jsonnet \ --tla-str "jobName=${job_name}" \ --tla-str-file "stepEnvFile=${BUILDKITE_ENV_FILE}" \ --tla-code "agentEnv=$(jq -c -n env)" \ @@ -73,17 +74,21 @@ jobspec="$(jsonnet \ if [[ "$use_agent_node_affinity" == "true" ]]; then for field in affinity tolerations; do buildkite_agent_value="$(kubectl get pod "$(cat /etc/hostname)" -o json | jq ".spec.$field")" - jobspec="$(echo "$jobspec" | jq ".spec.template.spec.$field=$buildkite_agent_value")" + job_spec="$(echo "$job_spec" | jq ".spec.template.spec.$field=$buildkite_agent_value")" done fi +if [[ "$print_resulting_job_spec" == "true" ]]; then + echo -e "Resulting k8s job spec:\n$job_spec" +fi + readonly job_apply_start_time="$SECONDS" job_apply_exit_code="" while [[ "$((SECONDS - job_apply_start_time))" -lt "$job_apply_loop_timeout_seconds" ]] do set +e - echo "$jobspec" | kubectl apply -f - + echo "$job_spec" | kubectl apply -f - job_apply_exit_code="$?" set -e From 98c92c82921182e5f52640d72f3fb081818dcec2 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 22 Jul 2020 18:58:22 -0700 Subject: [PATCH 42/62] Fix var name --- hooks/command | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index a04b971..1b000da 100755 --- a/hooks/command +++ b/hooks/command @@ -122,8 +122,8 @@ done echo "--- :kubernetes: Job status: $jobstatus" # Wait for logs to be fully printed, printing runs in a separate process and we're racing with it. -readonly log_wait_start_time="$SECONDS" -while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_wait_start_time))" -lt "$log_complete_loop_timeout_seconds" ]] +readonly log_complete_start_time="$SECONDS" +while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_complete_start_time))" -lt "$log_complete_loop_timeout_seconds" ]] do sleep "$log_complete_loop_interval_seconds" done From 8dc6865887deeb8612b59c5c3f664f0923617343 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 22 Jul 2020 23:33:41 -0700 Subject: [PATCH 43/62] Remove Job TTL from plugin.yaml for now --- plugin.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugin.yml b/plugin.yml index c65f6b5..8d529b8 100644 --- a/plugin.yml +++ b/plugin.yml @@ -29,8 +29,6 @@ configuration: type: [string, array] privileged: type: boolean - job-ttl-seconds-after-finished: - type: number secret-name: type: string git-credentials-secret-name: From 259cd3b5db83909011d5696509ae28537e9ed219 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 15 Sep 2020 14:33:45 -0700 Subject: [PATCH 44/62] WIP propagate exit code --- .gitignore | 1 + hooks/command | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f11b75 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ diff --git a/hooks/command b/hooks/command index 1b000da..98d0ca4 100755 --- a/hooks/command +++ b/hooks/command @@ -132,6 +132,10 @@ status=1 if [[ "$jobstatus" == "Complete" ]] ; then echo "success" status=0 +else + readonly pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") + readonly pod_json=$(kubectl get pod "$pod_name" -o json) + status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" fi exit $status From 65192202c691578f25d42d04d57fb4b12d5b49c6 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 15 Sep 2020 14:38:32 -0700 Subject: [PATCH 45/62] Wrap in loop --- hooks/command | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/hooks/command b/hooks/command index 98d0ca4..7bfb2be 100755 --- a/hooks/command +++ b/hooks/command @@ -128,14 +128,22 @@ do sleep "$log_complete_loop_interval_seconds" done -status=1 +status="" if [[ "$jobstatus" == "Complete" ]] ; then echo "success" status=0 else - readonly pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") - readonly pod_json=$(kubectl get pod "$pod_name" -o json) - status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" + while [[ -z "$status" ]] ; do + set +e + pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") + pod_json=$(kubectl get pod "$pod_name" -o json) + status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" + set -e + sleep "$job_status_loop_sleep_interval" + if [[ $timeout -gt 0 ]]; then + (( counter -= job_status_loop_sleep_interval )) || status="1" + fi + done fi exit $status From 32b9e289feb46b0caf056da9b27bdce752732752 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 30 Sep 2020 16:13:01 -0700 Subject: [PATCH 46/62] Inline log_loop_snapshot_limit_bytes --- hooks/command | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hooks/command b/hooks/command index 7bfb2be..c6edfc3 100755 --- a/hooks/command +++ b/hooks/command @@ -18,7 +18,6 @@ readonly job_apply_loop_interval_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_INTERVAL:- readonly job_apply_loop_timeout_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" readonly log_loop_interval_seconds="${K8S_PLUGIN_LOG_LOOP_INTERVAL:-3}" readonly log_loop_timeout_seconds="${K8S_PLUGIN_LOG_LOOP_TIMEOUT:-5}" -readonly log_loop_snapshot_limit_bytes="${K8S_PLUGIN_LOG_LOOP_SNAPSHOT_LIMIT_BYTES:-1024}" readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:-5}" readonly log_complete_loop_interval_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_INTERVAL:-1}" readonly log_complete_loop_timeout_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_TIMEOUT:-30}" @@ -40,7 +39,7 @@ function tail_logs { while true; do set +e - log_snapshot="$(timeout "$log_loop_timeout_seconds" kubectl logs --limit-bytes "$log_loop_snapshot_limit_bytes" "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout "$log_loop_timeout_seconds" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then break From e8e9cfc5aa2b4b379ba5dff08ccd7ce165a15352 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 30 Sep 2020 16:16:22 -0700 Subject: [PATCH 47/62] Use one convention for ENV var names --- hooks/command | 16 ++++++++-------- hooks/pre-exit | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/hooks/command b/hooks/command index c6edfc3..12957a2 100755 --- a/hooks/command +++ b/hooks/command @@ -13,14 +13,14 @@ fi ((timeout=BUILDKITE_TIMEOUT*60)) export BUILDKITE_TIMEOUT -# Default values can be overridden by setting "K8S_PLUGIN_*" env vars as used below. -readonly job_apply_loop_interval_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_INTERVAL:-5}" -readonly job_apply_loop_timeout_seconds="${K8S_PLUGIN_JOB_APPLY_LOOP_TIMEOUT:-120}" -readonly log_loop_interval_seconds="${K8S_PLUGIN_LOG_LOOP_INTERVAL:-3}" -readonly log_loop_timeout_seconds="${K8S_PLUGIN_LOG_LOOP_TIMEOUT:-5}" -readonly job_status_loop_sleep_interval="${K8S_PLUGIN_JOB_STATUS_LOOP_INTERVAL:-5}" -readonly log_complete_loop_interval_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_INTERVAL:-1}" -readonly log_complete_loop_timeout_seconds="${K8S_PLUGIN_LOG_COMPLETE_LOOP_TIMEOUT:-30}" +# Default values can be overridden by setting "BUILDKITE_PLUGIN_K8S_*" env vars as used below. +readonly job_apply_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL:-5}" +readonly job_apply_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT:-120}" +readonly log_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL:-3}" +readonly log_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_TIMEOUT:-5}" +readonly job_status_loop_sleep_interval="${BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL:-5}" +readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL:-1}" +readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} diff --git a/hooks/pre-exit b/hooks/pre-exit index c04492a..3ff2671 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,10 +6,10 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -# Default values can be overridden by setting "K8S_PLUGIN_*" env vars as used below. -readonly job_cleanup_if_successful="${K8S_PLUGIN_JOB_CLEANUP_IF_SUCCESSFUL:-true}" -readonly job_cleanup_loop_interval_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_INTERVAL:-5}" -readonly job_cleanup_loop_timeout_seconds="${K8S_PLUGIN_JOB_CLEANUP_LOOP_TIMEOUT:-60}" +# Default values can be overridden by setting "BUILDKITE_PLUGIN_K8S_*" env vars as used below. +readonly job_cleanup_if_successful="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_IF_SUCCESSFUL:-true}" +readonly job_cleanup_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL:-5}" +readonly job_cleanup_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT:-60}" readonly job_cleanup_start_time="$SECONDS" # Optionally skip cleanup if Job succeeded (reduces load on k8s apiserver). From 8603721412ac327b45c2844aed8e74f5b8c4113e Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Wed, 30 Sep 2020 17:07:14 -0700 Subject: [PATCH 48/62] Add docs for new env variables --- README.md | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ hooks/command | 4 +-- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index facb19f..74a4693 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,81 @@ patch: | } ``` +### Configurable Environment Variables + +Some of the plugin options can be configured via environment variables as following ([also see Buildkite docs](https://buildkite.com/docs/pipelines/environment-variables#defining-your-own)): + +```yaml +env: + BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC: "true" +``` + +#### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL + +- Configures loop interval between plugin attempts to schedule the k8s job +- Default: `5` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT + +- Configures time limit for plugin attempts to schedule the k8s job +- Default: `120` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL + +- Configures loop interval for plugin attempts to get k8s job status +- Default: `5` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL + +- Configures loop interval for plugin attempts to verify that log streaming has ended +- Default: `1` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT + +- Configures time limit for plugin attempts to verify that log streaming has ended +- Default: `30` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL + +- Configures loop interval for plugin attempts to stream job logs +- Default: `3` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT + +- Configures time limit for a _single_ plugin attempt to stream job logs +- Default: `5` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC + +- Configures whether plugin should print resulting k8s job spec into the log +- Default: `false` +- Unit type: `true` or `false` string + +#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_IF_SUCCESSFUL + +- Configures whether plugin should cleanup k8s job after its successful termination, you might want to disable it in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) +- Default: `true` +- Unit type: `true` or `false` string + +#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL + +- Configures loop interval for plugin attempts to cleanup finished jobs +- Default: `5` +- Unit type: integer seconds + +#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT + +- Configures time limit for plugin attempts to cleanup finished jobs +- Default: `60` +- Unit type: integer seconds + ## Contributing We welcome community contributions to this project. diff --git a/hooks/command b/hooks/command index 12957a2..967513f 100755 --- a/hooks/command +++ b/hooks/command @@ -17,7 +17,7 @@ export BUILDKITE_TIMEOUT readonly job_apply_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL:-5}" readonly job_apply_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT:-120}" readonly log_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL:-3}" -readonly log_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_TIMEOUT:-5}" +readonly log_loop_attempt_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT:-5}" readonly job_status_loop_sleep_interval="${BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL:-5}" readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL:-1}" readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" @@ -39,7 +39,7 @@ function tail_logs { while true; do set +e - log_snapshot="$(timeout "$log_loop_timeout_seconds" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout "$log_loop_attempt_timeout_seconds" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then break From dffecae8a1b975348da05debdd0d40855c5dc2b9 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 13:38:32 -0700 Subject: [PATCH 49/62] Return old jobs cleanup back --- README.md | 6 ++++++ hooks/command | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/README.md b/README.md index 74a4693..cc1eaa1 100644 --- a/README.md +++ b/README.md @@ -314,6 +314,12 @@ env: - Default: `60` - Unit type: integer seconds +#### BUILDKITE_PLUGIN_K8S_REMOVE_OLD_JOBS + +- Configures whether plugin should cleanup finished k8s jobs if they're older than 1 day, you might want to disable this in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) +- Default: `true` +- Unit type: `true` or `false` string + ## Contributing We welcome community contributions to this project. diff --git a/hooks/command b/hooks/command index 967513f..71e87ff 100755 --- a/hooks/command +++ b/hooks/command @@ -23,11 +23,17 @@ readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} +readonly remove_old_jobs=${BUILDKITE_PLUGIN_K8S_REMOVE_OLD_JOBS:-true} readonly job_log_complete_marker_file="$(mktemp)" function cleanup { rm -f "$job_log_complete_marker_file" + + if [[ "$remove_old_jobs" == "true" ]]; then + # Delete all jobs older than a day. + kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true + fi } trap cleanup EXIT From 5daa8a1fd8a8a99df32b62d228e6432bc66325b8 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 13:40:29 -0700 Subject: [PATCH 50/62] Check job_apply_exit_code --- hooks/command | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index 71e87ff..047dd5a 100755 --- a/hooks/command +++ b/hooks/command @@ -104,7 +104,11 @@ do sleep "$job_apply_loop_interval_seconds" fi done -echo "Apply job exit code '$job_apply_exit_code'" + +if [[ "$job_apply_exit_code" != "0" ]]; then + echo "Failed to apply job, exit code '$job_apply_exit_code'" + exit "$job_apply_exit_code" +fi echo "Timeout: ${timeout}s" From 941e86b9d4c16e30eabf6cf28fcff49513b72a7e Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 13:44:54 -0700 Subject: [PATCH 51/62] Dont sleep if jobstatus has been resolved --- hooks/command | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index 047dd5a..75fe715 100755 --- a/hooks/command +++ b/hooks/command @@ -122,7 +122,13 @@ while [[ -z "$jobstatus" ]] ; do set +e jobstatus=$(kubectl get job "${job_name}" -o 'jsonpath={.status.conditions[].type}') set -e - sleep "$job_status_loop_sleep_interval" + + if [[ -n "$jobstatus" ]]; then + break + else + sleep "$job_status_loop_sleep_interval" + fi + if [[ $timeout -gt 0 ]]; then (( counter -= job_status_loop_sleep_interval )) || jobstatus="timeout" fi From 2085c8813b41245cb44a24a7df22c60168661784 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 13:50:54 -0700 Subject: [PATCH 52/62] Cleanup finished jobs by default --- README.md | 6 +++--- hooks/pre-exit | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index cc1eaa1..9fea723 100644 --- a/README.md +++ b/README.md @@ -296,9 +296,9 @@ env: - Default: `false` - Unit type: `true` or `false` string -#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_IF_SUCCESSFUL +#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH -- Configures whether plugin should cleanup k8s job after its successful termination, you might want to disable it in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) +- Configures whether plugin should cleanup k8s job after it finishes, you might want to disable it in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) - Default: `true` - Unit type: `true` or `false` string @@ -316,7 +316,7 @@ env: #### BUILDKITE_PLUGIN_K8S_REMOVE_OLD_JOBS -- Configures whether plugin should cleanup finished k8s jobs if they're older than 1 day, you might want to disable this in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) +- Configures whether plugin should cleanup k8s jobs if they're older than 1 day, you might want to disable this in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) - Default: `true` - Unit type: `true` or `false` string diff --git a/hooks/pre-exit b/hooks/pre-exit index 3ff2671..6a0cad2 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -7,14 +7,14 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" # Default values can be overridden by setting "BUILDKITE_PLUGIN_K8S_*" env vars as used below. -readonly job_cleanup_if_successful="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_IF_SUCCESSFUL:-true}" +readonly job_cleanup_after_finish="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH:-true}" readonly job_cleanup_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT:-60}" readonly job_cleanup_start_time="$SECONDS" # Optionally skip cleanup if Job succeeded (reduces load on k8s apiserver). # Set to "false" if you have TTL Job Controller, https://github.com/lwolf/kube-cleanup-operator or other cleanup controller. -if [[ "$BUILDKITE_COMMAND_EXIT_STATUS" == "0" && "$job_cleanup_if_successful" != "true" ]]; then +if [[ "$job_cleanup_after_finish" != "true" ]]; then exit 0 fi From 1cef288f592c60828aecd46b589855b079010c56 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 13:53:30 -0700 Subject: [PATCH 53/62] Document BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 9fea723..e2e2921 100644 --- a/README.md +++ b/README.md @@ -320,6 +320,12 @@ env: - Default: `true` - Unit type: `true` or `false` string +#### BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED + +- Configures [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) on the k8s job, requires TTL Controller enabled in the cluster. +- Default: `86400` +- Unit type: integer seconds + ## Contributing We welcome community contributions to this project. From f89209b9fea3d572bb58c05e8b5f91ec66e1da4c Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Thu, 1 Oct 2020 18:16:13 -0700 Subject: [PATCH 54/62] Move some of the properties to YAML --- README.md | 56 +++++++++++++++++++++++--------------------------- hooks/command | 4 ++-- hooks/pre-exit | 1 - plugin.yml | 9 ++++++++ 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index e2e2921..77da02b 100644 --- a/README.md +++ b/README.md @@ -239,93 +239,89 @@ patch: | } ``` -### Configurable Environment Variables +### `print-resulting-job-spec` (optional, boolean) + +If set to `true`, the resulting k8s job spec is printed to the log, can be useful for debugging. + +### `job-ttl-seconds-after-finished` (optinal, integer) + +Configures [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) on the k8s job, requires TTL Controller enabled in the cluster, otherwise ignored. +Default value: `86400`. If you have TTL controller running, it is highly recommended to set `builtin-jobs-cleanup` to `false` to reduce load on k8s api servers. + +### `builtin-jobs-cleanup` (optional, boolean) + +If set to `true` plugin cleans up k8s jobs older than 1 day even if they're still running. +Default value: `true`. + +## Low Level Configuration via Environment Variables Some of the plugin options can be configured via environment variables as following ([also see Buildkite docs](https://buildkite.com/docs/pipelines/environment-variables#defining-your-own)): ```yaml env: - BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC: "true" + BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL: "10" ``` -#### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL - Configures loop interval between plugin attempts to schedule the k8s job - Default: `5` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT - Configures time limit for plugin attempts to schedule the k8s job - Default: `120` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL - Configures loop interval for plugin attempts to get k8s job status - Default: `5` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL - Configures loop interval for plugin attempts to verify that log streaming has ended - Default: `1` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT - Configures time limit for plugin attempts to verify that log streaming has ended - Default: `30` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL - Configures loop interval for plugin attempts to stream job logs - Default: `3` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT +### BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT - Configures time limit for a _single_ plugin attempt to stream job logs - Default: `5` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC - -- Configures whether plugin should print resulting k8s job spec into the log -- Default: `false` -- Unit type: `true` or `false` string - -#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH +### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH - Configures whether plugin should cleanup k8s job after it finishes, you might want to disable it in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) - Default: `true` - Unit type: `true` or `false` string -#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL - Configures loop interval for plugin attempts to cleanup finished jobs - Default: `5` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT - Configures time limit for plugin attempts to cleanup finished jobs - Default: `60` - Unit type: integer seconds -#### BUILDKITE_PLUGIN_K8S_REMOVE_OLD_JOBS - -- Configures whether plugin should cleanup k8s jobs if they're older than 1 day, you might want to disable this in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) -- Default: `true` -- Unit type: `true` or `false` string - -#### BUILDKITE_PLUGIN_K8S_JOB_TTL_SECONDS_AFTER_FINISHED - -- Configures [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) on the k8s job, requires TTL Controller enabled in the cluster. -- Default: `86400` -- Unit type: integer seconds - ## Contributing We welcome community contributions to this project. diff --git a/hooks/command b/hooks/command index 75fe715..e0dfb2e 100755 --- a/hooks/command +++ b/hooks/command @@ -23,14 +23,14 @@ readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} -readonly remove_old_jobs=${BUILDKITE_PLUGIN_K8S_REMOVE_OLD_JOBS:-true} +readonly builtin_jobs_cleanup=${BUILDKITE_PLUGIN_K8S_BUILTIN_JOBS_CLEANUP:-} readonly job_log_complete_marker_file="$(mktemp)" function cleanup { rm -f "$job_log_complete_marker_file" - if [[ "$remove_old_jobs" == "true" ]]; then + if [[ "$builtin_jobs_cleanup" == "true" ]]; then # Delete all jobs older than a day. kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true fi diff --git a/hooks/pre-exit b/hooks/pre-exit index 6a0cad2..070282c 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,7 +6,6 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -# Default values can be overridden by setting "BUILDKITE_PLUGIN_K8S_*" env vars as used below. readonly job_cleanup_after_finish="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH:-true}" readonly job_cleanup_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT:-60}" diff --git a/plugin.yml b/plugin.yml index 8d529b8..313d040 100644 --- a/plugin.yml +++ b/plugin.yml @@ -63,6 +63,15 @@ configuration: type: string use-agent-node-affinity: type: boolean + print-resulting-job-spec: + type: boolean + default: false + job-ttl-seconds-after-finished: + type: integer + default: 86400 + builtin-jobs-cleanup: + type: boolean + default: true required: - image additionalProperties: false From 071bb474257fb4a69a817c2bdd29900ea43d3ec0 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 6 Oct 2020 16:03:22 -0700 Subject: [PATCH 55/62] Add jobs-cleanup-via-plugin and job-cleanup-after-finished-via-plugin to YAML --- README.md | 13 +++++++++++-- hooks/command | 4 ++-- hooks/pre-exit | 6 ++---- plugin.yml | 5 ++++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 77da02b..c038101 100644 --- a/README.md +++ b/README.md @@ -246,13 +246,22 @@ If set to `true`, the resulting k8s job spec is printed to the log, can be usefu ### `job-ttl-seconds-after-finished` (optinal, integer) Configures [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) on the k8s job, requires TTL Controller enabled in the cluster, otherwise ignored. -Default value: `86400`. If you have TTL controller running, it is highly recommended to set `builtin-jobs-cleanup` to `false` to reduce load on k8s api servers. +Default value: `86400`. -### `builtin-jobs-cleanup` (optional, boolean) +### `jobs-cleanup-via-plugin` (optional, boolean) If set to `true` plugin cleans up k8s jobs older than 1 day even if they're still running. Default value: `true`. +If you have TTL controller or https://github.com/lwolf/kube-cleanup-operator running, it is highly recommended to set the value to `false` to reduce load on k8s api servers. + +### `job-cleanup-after-finished-via-plugin` (optional, boolean) + +If set to `true` plugin cleans up finished k8s job. +Default value: `true`. + +If you have TTL controller or https://github.com/lwolf/kube-cleanup-operator running, it is highly recommended to set the value to `false` to reduce load on k8s api servers. + ## Low Level Configuration via Environment Variables Some of the plugin options can be configured via environment variables as following ([also see Buildkite docs](https://buildkite.com/docs/pipelines/environment-variables#defining-your-own)): diff --git a/hooks/command b/hooks/command index e0dfb2e..2361fa2 100755 --- a/hooks/command +++ b/hooks/command @@ -23,14 +23,14 @@ readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} -readonly builtin_jobs_cleanup=${BUILDKITE_PLUGIN_K8S_BUILTIN_JOBS_CLEANUP:-} +readonly jobs_cleanup_via_plugin=${BUILDKITE_PLUGIN_K8S_JOBS_CLEANUP_VIA_PLUGIN:-} readonly job_log_complete_marker_file="$(mktemp)" function cleanup { rm -f "$job_log_complete_marker_file" - if [[ "$builtin_jobs_cleanup" == "true" ]]; then + if [[ "$jobs_cleanup_via_plugin" == "true" ]]; then # Delete all jobs older than a day. kubectl delete job "$(kubectl get job -l buildkite/plugin=k8s | awk 'match($4,/[0-9]+d/) {print $1}')" 2>/dev/null || true fi diff --git a/hooks/pre-exit b/hooks/pre-exit index 070282c..34e53f6 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -6,14 +6,12 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" -readonly job_cleanup_after_finish="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH:-true}" +readonly job_cleanup_after_finished_via_plugin="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISHED_VIA_PLUGIN:-}" readonly job_cleanup_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL:-5}" readonly job_cleanup_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT:-60}" readonly job_cleanup_start_time="$SECONDS" -# Optionally skip cleanup if Job succeeded (reduces load on k8s apiserver). -# Set to "false" if you have TTL Job Controller, https://github.com/lwolf/kube-cleanup-operator or other cleanup controller. -if [[ "$job_cleanup_after_finish" != "true" ]]; then +if [[ "$job_cleanup_after_finished_via_plugin" != "true" ]]; then exit 0 fi diff --git a/plugin.yml b/plugin.yml index 313d040..7a13384 100644 --- a/plugin.yml +++ b/plugin.yml @@ -69,7 +69,10 @@ configuration: job-ttl-seconds-after-finished: type: integer default: 86400 - builtin-jobs-cleanup: + jobs-cleanup-via-plugin: + type: boolean + default: true + job-cleanup-after-finished-via-plugin: type: boolean default: true required: From b5a69698ee2b315b4fbe32d5c737aef6bfde7d37 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 6 Oct 2020 16:12:27 -0700 Subject: [PATCH 56/62] Unify retry interval and timeout env var names --- hooks/command | 36 ++++++++++++++++++------------------ hooks/pre-exit | 8 ++++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/hooks/command b/hooks/command index 2361fa2..3b1592a 100755 --- a/hooks/command +++ b/hooks/command @@ -14,13 +14,13 @@ fi export BUILDKITE_TIMEOUT # Default values can be overridden by setting "BUILDKITE_PLUGIN_K8S_*" env vars as used below. -readonly job_apply_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL:-5}" -readonly job_apply_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT:-120}" -readonly log_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL:-3}" -readonly log_loop_attempt_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT:-5}" -readonly job_status_loop_sleep_interval="${BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL:-5}" -readonly log_complete_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL:-1}" -readonly log_complete_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT:-30}" +readonly job_apply_retry_interval_sec="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_RETRY_INTERVAL_SEC:-5}" +readonly job_apply_timeout_sec="${BUILDKITE_PLUGIN_K8S_JOB_APPLY_TIMEOUT_SEC:-120}" +readonly log_loop_retry_interval_sec="${BUILDKITE_PLUGIN_K8S_LOG_RETRY_INTERVAL_SEC:-3}" +readonly log_attempt_timeout_sec="${BUILDKITE_PLUGIN_K8S_LOG_ATTEMPT_TIMEOUT_SEC:-5}" +readonly job_status_retry_interval_sec="${BUILDKITE_PLUGIN_K8S_JOB_STATUS_RETRY_INTERVAL_SEC:-5}" +readonly log_complete_retry_interval_sec="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_RETRY_INTERVAL_SEC:-1}" +readonly log_complete_timeout_sec="${BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_TIMEOUT_SEC:-30}" readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY:-false} readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} readonly jobs_cleanup_via_plugin=${BUILDKITE_PLUGIN_K8S_JOBS_CLEANUP_VIA_PLUGIN:-} @@ -45,12 +45,12 @@ function tail_logs { while true; do set +e - log_snapshot="$(timeout "$log_loop_attempt_timeout_seconds" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout "$log_attempt_timeout_sec" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then break fi - sleep "$log_loop_interval_seconds" + sleep "$log_loop_retry_interval_sec" done # Run kubectl logs --follow in a loop since it can fail: @@ -59,7 +59,7 @@ function tail_logs { # 3) It can hang not providing any result, that's why we check not only exit code but also contents in the loop above. while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; do - sleep "$log_loop_interval_seconds" + sleep "$log_loop_retry_interval_sec" done echo "0" > "$job_log_complete_marker_file" @@ -90,7 +90,7 @@ fi readonly job_apply_start_time="$SECONDS" job_apply_exit_code="" -while [[ "$((SECONDS - job_apply_start_time))" -lt "$job_apply_loop_timeout_seconds" ]] +while [[ "$((SECONDS - job_apply_start_time))" -lt "$job_apply_timeout_sec" ]] do set +e echo "$job_spec" | kubectl apply -f - @@ -101,7 +101,7 @@ do break else echo "Attempt to apply the job failed, exit code '$job_apply_exit_code'" - sleep "$job_apply_loop_interval_seconds" + sleep "$job_apply_retry_interval_sec" fi done @@ -126,11 +126,11 @@ while [[ -z "$jobstatus" ]] ; do if [[ -n "$jobstatus" ]]; then break else - sleep "$job_status_loop_sleep_interval" + sleep "$job_status_retry_interval_sec" fi if [[ $timeout -gt 0 ]]; then - (( counter -= job_status_loop_sleep_interval )) || jobstatus="timeout" + (( counter -= job_status_retry_interval_sec )) || jobstatus="timeout" fi done @@ -138,9 +138,9 @@ echo "--- :kubernetes: Job status: $jobstatus" # Wait for logs to be fully printed, printing runs in a separate process and we're racing with it. readonly log_complete_start_time="$SECONDS" -while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_complete_start_time))" -lt "$log_complete_loop_timeout_seconds" ]] +while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_complete_start_time))" -lt "$log_complete_timeout_sec" ]] do - sleep "$log_complete_loop_interval_seconds" + sleep "$log_complete_retry_interval_sec" done status="" @@ -154,9 +154,9 @@ else pod_json=$(kubectl get pod "$pod_name" -o json) status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" set -e - sleep "$job_status_loop_sleep_interval" + sleep "$job_status_retry_interval_sec" if [[ $timeout -gt 0 ]]; then - (( counter -= job_status_loop_sleep_interval )) || status="1" + (( counter -= job_status_retry_interval_sec )) || status="1" fi done fi diff --git a/hooks/pre-exit b/hooks/pre-exit index 34e53f6..89883f0 100755 --- a/hooks/pre-exit +++ b/hooks/pre-exit @@ -7,8 +7,8 @@ job_name="$(cat /tmp/job_name)" echo "--- :kubernetes: Cleanup" readonly job_cleanup_after_finished_via_plugin="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISHED_VIA_PLUGIN:-}" -readonly job_cleanup_loop_interval_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL:-5}" -readonly job_cleanup_loop_timeout_seconds="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT:-60}" +readonly job_cleanup_retry_sec="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_RETRY_INTERVAL_SEC:-5}" +readonly job_cleanup_timeout_sec="${BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_TIMEOUT_SEC:-60}" readonly job_cleanup_start_time="$SECONDS" if [[ "$job_cleanup_after_finished_via_plugin" != "true" ]]; then @@ -16,7 +16,7 @@ if [[ "$job_cleanup_after_finished_via_plugin" != "true" ]]; then fi job_cleanup_exit_code="" -while [[ "$((SECONDS - job_cleanup_start_time))" -lt "$job_cleanup_loop_timeout_seconds" ]] +while [[ "$((SECONDS - job_cleanup_start_time))" -lt "$job_cleanup_timeout_sec" ]] do set +e pod=$(kubectl get pod --output=name -l "job-name=${job_name}") @@ -31,7 +31,7 @@ do break else echo "Attempt to cleanup the job failed, exit code '$job_cleanup_exit_code'" - sleep "$job_cleanup_loop_interval_seconds" + sleep "$job_cleanup_retry_sec" fi done From 3d1589a796870279e56f449dcad215971ea63d1d Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Tue, 6 Oct 2020 18:22:57 -0700 Subject: [PATCH 57/62] Update README with new env var names --- README.md | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c038101..21f7c92 100644 --- a/README.md +++ b/README.md @@ -268,64 +268,58 @@ Some of the plugin options can be configured via environment variables as follow ```yaml env: - BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL: "10" + BUILDKITE_PLUGIN_K8S_JOB_APPLY_RETRY_INTERVAL_SEC: "10" ``` -### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_APPLY_RETRY_INTERVAL_SEC - Configures loop interval between plugin attempts to schedule the k8s job - Default: `5` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_JOB_APPLY_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_JOB_APPLY_TIMEOUT_SEC - Configures time limit for plugin attempts to schedule the k8s job - Default: `120` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_JOB_STATUS_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_STATUS_RETRY_INTERVAL_SEC - Configures loop interval for plugin attempts to get k8s job status - Default: `5` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_RETRY_INTERVAL_SEC - Configures loop interval for plugin attempts to verify that log streaming has ended - Default: `1` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_TIMEOUT_SEC - Configures time limit for plugin attempts to verify that log streaming has ended - Default: `30` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_LOG_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_LOG_RETRY_INTERVAL_SEC - Configures loop interval for plugin attempts to stream job logs - Default: `3` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_LOG_LOOP_ATTEMPT_TIMEOUT +### BUILDKITE_PLUGIN_K8S_LOG_ATTEMPT_TIMEOUT_SEC - Configures time limit for a _single_ plugin attempt to stream job logs - Default: `5` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_AFTER_FINISH - -- Configures whether plugin should cleanup k8s job after it finishes, you might want to disable it in case you rely on [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) or [lwolf/kube-cleanup-operator](https://github.com/lwolf/kube-cleanup-operator) -- Default: `true` -- Unit type: `true` or `false` string - -### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_INTERVAL +### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_RETRY_INTERVAL_SEC - Configures loop interval for plugin attempts to cleanup finished jobs - Default: `5` - Unit type: integer seconds -### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_LOOP_TIMEOUT +### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_TIMEOUT_SEC - Configures time limit for plugin attempts to cleanup finished jobs - Default: `60` From f46dac54bab327034f45ddd97b2f05dd4eb30d6e Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 9 Oct 2020 16:10:23 -0700 Subject: [PATCH 58/62] Use init container exit code if it failed --- hooks/command | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/hooks/command b/hooks/command index 3b1592a..f60194c 100755 --- a/hooks/command +++ b/hooks/command @@ -146,19 +146,34 @@ done status="" if [[ "$jobstatus" == "Complete" ]] ; then echo "success" - status=0 + status="0" else - while [[ -z "$status" ]] ; do + while true + do set +e pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") pod_json=$(kubectl get pod "$pod_name" -o json) - status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" + init_container_status="$(echo "$pod_json" | jq ".status.initContainerStatuses[0].state.terminated.exitCode")" + if [[ -n "$init_container_status" && "$init_container_status" != "0" ]]; then + status="$init_container_status" + else + status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" + fi set -e - sleep "$job_status_retry_interval_sec" - if [[ $timeout -gt 0 ]]; then - (( counter -= job_status_retry_interval_sec )) || status="1" + if [[ -n "$status" ]]; then + break + else + sleep "$job_status_retry_interval_sec" + if [[ $timeout -gt 0 ]]; then + (( counter -= job_status_retry_interval_sec )) + fi fi done fi -exit $status +if [[ -z "$status" ]]; then + echo "Warning: could not get actual exit code of the job" + status="42" +fi + +exit "$status" From d948c351cd54224c358b9548293370a16ad3d7a4 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 9 Oct 2020 16:16:14 -0700 Subject: [PATCH 59/62] Add warning about init container failure --- hooks/command | 1 + 1 file changed, 1 insertion(+) diff --git a/hooks/command b/hooks/command index f60194c..4975451 100755 --- a/hooks/command +++ b/hooks/command @@ -155,6 +155,7 @@ else pod_json=$(kubectl get pod "$pod_name" -o json) init_container_status="$(echo "$pod_json" | jq ".status.initContainerStatuses[0].state.terminated.exitCode")" if [[ -n "$init_container_status" && "$init_container_status" != "0" ]]; then + echo "Warning: init container failed with exit code $init_container_status, this usually indicates plugin misconfiguration or infrastructure failure" status="$init_container_status" else status="$(echo "$pod_json" | jq ".status.containerStatuses[0].state.terminated.exitCode")" From cb2f53ee79b1df6249cd4560feb7a9b910653c7c Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 9 Oct 2020 16:33:23 -0700 Subject: [PATCH 60/62] Print bootstrap container logs --- hooks/command | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/hooks/command b/hooks/command index 4975451..a2feed4 100755 --- a/hooks/command +++ b/hooks/command @@ -25,10 +25,11 @@ readonly use_agent_node_affinity=${BUILDKITE_PLUGIN_K8S_USE_AGENT_NODE_AFFINITY: readonly print_resulting_job_spec=${BUILDKITE_PLUGIN_K8S_PRINT_RESULTING_JOB_SPEC:-false} readonly jobs_cleanup_via_plugin=${BUILDKITE_PLUGIN_K8S_JOBS_CLEANUP_VIA_PLUGIN:-} -readonly job_log_complete_marker_file="$(mktemp)" +readonly bootstrap_container_log_complete_marker_file="$(mktemp)" +readonly step_container_log_complete_marker_file="$(mktemp)" function cleanup { - rm -f "$job_log_complete_marker_file" + rm -f "$bootstrap_container_log_complete_marker_file" "$step_container_log_complete_marker_file" if [[ "$jobs_cleanup_via_plugin" == "true" ]]; then # Delete all jobs older than a day. @@ -39,13 +40,17 @@ function cleanup { trap cleanup EXIT function tail_logs { + local -r pod_name="$1" + local -r container_name="$2" + local -r log_completion_marker_file="$3" + local log_snapshot="" # Once logs are not empty we start attempting to stream them. # Keep looping otherwise since empty output means that there is no useful log to display so we're not losing information by looping. while true; do set +e - log_snapshot="$(timeout "$log_attempt_timeout_sec" kubectl logs --limit-bytes "1024" "job/${job_name}" 2>/dev/null)" + log_snapshot="$(timeout "$log_attempt_timeout_sec" kubectl logs --limit-bytes "1024" "pod/$pod_name" --container "$container_name" 2>/dev/null)" set -e if [[ -n "$log_snapshot" ]]; then break @@ -57,12 +62,12 @@ function tail_logs { # 1) It can fail due to pod not being initialized yet: "Error from server (BadRequest): container "step" in pod "somepod" is waiting to start: PodInitializing" # 2) It can fail mid-streaming, in this case we unfortunately will display logs multiple times (partially). # 3) It can hang not providing any result, that's why we check not only exit code but also contents in the loop above. - while ! kubectl logs --follow "job/${job_name}" 2>/dev/null; + while ! kubectl logs --follow "pod/$pod_name" --container "$container_name" 2>/dev/null; do sleep "$log_loop_retry_interval_sec" done - echo "0" > "$job_log_complete_marker_file" + echo "0" > "$log_completion_marker_file" } echo "--- :kubernetes: Starting Kubernetes Job" @@ -112,11 +117,34 @@ fi echo "Timeout: ${timeout}s" -echo "+++ :kubernetes: Running image: ${BUILDKITE_PLUGIN_K8S_IMAGE}" +echo "--- :kubernetes: Running image: ${BUILDKITE_PLUGIN_K8S_IMAGE}" + +counter="$timeout" +while true +do + set +e + pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") + set -e + + if [[ -n "$pod_name" ]]; then + break + else + sleep "$job_status_retry_interval_sec" + if [[ $timeout -gt 0 ]]; then + (( counter -= job_status_retry_interval_sec )) + fi + fi +done + +echo "Pod is running: $pod_name" + +echo "--- :kubernetes: bootstrap container" +tail_log "$pod_name" "bootstrap" "$bootstrap_container_log_complete_marker_file" -tail_logs & +echo "+++ :kubernetes: step container" +tail_logs "$pod_name" "step" "$step_container_log_complete_marker_file" & -counter=${timeout} +counter="$timeout" jobstatus="" while [[ -z "$jobstatus" ]] ; do set +e @@ -138,7 +166,7 @@ echo "--- :kubernetes: Job status: $jobstatus" # Wait for logs to be fully printed, printing runs in a separate process and we're racing with it. readonly log_complete_start_time="$SECONDS" -while [[ "$(cat "$job_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_complete_start_time))" -lt "$log_complete_timeout_sec" ]] +while [[ "$(cat "$step_container_log_complete_marker_file")" != "0" ]] && [[ "$((SECONDS - log_complete_start_time))" -lt "$log_complete_timeout_sec" ]] do sleep "$log_complete_retry_interval_sec" done @@ -151,7 +179,6 @@ else while true do set +e - pod_name=$(kubectl get pod -l "job-name=$job_name" --output=jsonpath="{.items[*].metadata.name}") pod_json=$(kubectl get pod "$pod_name" -o json) init_container_status="$(echo "$pod_json" | jq ".status.initContainerStatuses[0].state.terminated.exitCode")" if [[ -n "$init_container_status" && "$init_container_status" != "0" ]]; then From 09dd224638349f580c17f3074655721a7eabf369 Mon Sep 17 00:00:00 2001 From: Artem Zinnatullin Date: Fri, 9 Oct 2020 16:35:52 -0700 Subject: [PATCH 61/62] fix func name --- hooks/command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hooks/command b/hooks/command index a2feed4..009cf4f 100755 --- a/hooks/command +++ b/hooks/command @@ -139,7 +139,7 @@ done echo "Pod is running: $pod_name" echo "--- :kubernetes: bootstrap container" -tail_log "$pod_name" "bootstrap" "$bootstrap_container_log_complete_marker_file" +tail_logs "$pod_name" "bootstrap" "$bootstrap_container_log_complete_marker_file" echo "+++ :kubernetes: step container" tail_logs "$pod_name" "step" "$step_container_log_complete_marker_file" & From f7a867da1b819be559e745342145517b154544a4 Mon Sep 17 00:00:00 2001 From: "Artem Zinnatullin :slowpoke" Date: Mon, 12 Oct 2020 21:04:27 -0700 Subject: [PATCH 62/62] Apply suggestions from code review Co-authored-by: Ifeanyi Ubah --- README.md | 24 ++++++++++++------------ hooks/command | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 21f7c92..f7f91eb 100644 --- a/README.md +++ b/README.md @@ -241,19 +241,19 @@ patch: | ### `print-resulting-job-spec` (optional, boolean) -If set to `true`, the resulting k8s job spec is printed to the log, can be useful for debugging. +If set to `true`, the resulting k8s job spec is printed to the log. This can be useful when debugging. -### `job-ttl-seconds-after-finished` (optinal, integer) +### `job-ttl-seconds-after-finished` (optional, integer) Configures [`spec.ttlSecondsAfterFinished`](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) on the k8s job, requires TTL Controller enabled in the cluster, otherwise ignored. Default value: `86400`. ### `jobs-cleanup-via-plugin` (optional, boolean) -If set to `true` plugin cleans up k8s jobs older than 1 day even if they're still running. +If set to `true`, the plugin cleans up k8s jobs older than 1 day even if they're still running. Default value: `true`. -If you have TTL controller or https://github.com/lwolf/kube-cleanup-operator running, it is highly recommended to set the value to `false` to reduce load on k8s api servers. +If you have [TTL Controller](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/) enabled or some other means to cleanup finished jobs, it is recommended to set the value to `false` in order to reduce load on k8s api servers. ### `job-cleanup-after-finished-via-plugin` (optional, boolean) @@ -273,37 +273,37 @@ env: ### BUILDKITE_PLUGIN_K8S_JOB_APPLY_RETRY_INTERVAL_SEC -- Configures loop interval between plugin attempts to schedule the k8s job +- Configures the interval between attempts to schedule the k8s job - Default: `5` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_JOB_APPLY_TIMEOUT_SEC -- Configures time limit for plugin attempts to schedule the k8s job +- Configures the total time limit across attempts to schedule the k8s job - Default: `120` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_JOB_STATUS_RETRY_INTERVAL_SEC -- Configures loop interval for plugin attempts to get k8s job status +- Configures the interval between attempts to get k8s job status - Default: `5` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_RETRY_INTERVAL_SEC -- Configures loop interval for plugin attempts to verify that log streaming has ended +- Configures the interval between attempts to verify that log streaming has ended - Default: `1` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_LOG_COMPLETE_TIMEOUT_SEC -- Configures time limit for plugin attempts to verify that log streaming has ended +- Configures the total time limit across attempts to verify that log streaming has ended - Default: `30` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_LOG_RETRY_INTERVAL_SEC -- Configures loop interval for plugin attempts to stream job logs +- Configures the interval between attempts to stream job logs - Default: `3` - Unit type: integer seconds @@ -315,13 +315,13 @@ env: ### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_RETRY_INTERVAL_SEC -- Configures loop interval for plugin attempts to cleanup finished jobs +- Configures the interval between attempts to cleanup finished jobs - Default: `5` - Unit type: integer seconds ### BUILDKITE_PLUGIN_K8S_JOB_CLEANUP_TIMEOUT_SEC -- Configures time limit for plugin attempts to cleanup finished jobs +- Configures the total time limit across attempts to cleanup finished jobs - Default: `60` - Unit type: integer seconds diff --git a/hooks/command b/hooks/command index 009cf4f..50a5b1e 100755 --- a/hooks/command +++ b/hooks/command @@ -158,7 +158,7 @@ while [[ -z "$jobstatus" ]] ; do fi if [[ $timeout -gt 0 ]]; then - (( counter -= job_status_retry_interval_sec )) || jobstatus="timeout" + (( counter -= job_status_retry_interval_sec )) || jobstatus="timed-out" fi done