From 953d655882848ddbccb3f2c261363bc966fb10e0 Mon Sep 17 00:00:00 2001 From: George Cheng Date: Fri, 1 Mar 2019 11:18:58 +0800 Subject: [PATCH] Fix auto retries when out of memory. (#1108) * Distinguish cgroup OOM from dmesg. * Remove cgroup OOM detection. Make every OOM exit with code 5. * Exit with code 55 on OOM. --- src/rest-server/src/models/job.js | 3 +- .../templates/yarnContainerScript.mustache | 35 +++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/rest-server/src/models/job.js b/src/rest-server/src/models/job.js index e27617ba12..0f83b1b0ce 100644 --- a/src/rest-server/src/models/job.js +++ b/src/rest-server/src/models/job.js @@ -425,7 +425,8 @@ class Job { 'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName), 'taskData': data.taskRoles[idx], 'jobData': data, - 'inspectFormat': '{{.State.Pid}}', + 'inspectPidFormat': '{{.State.Pid}}', + 'inspectOOMKilledFormat': '{{.State.OOMKilled}}', 'jobEnvs': jobEnvs, 'azRDMA': azureEnv.azRDMA === 'false' ? false : true, 'reqAzRDMA': data.jobEnvs && data.jobEnvs.paiAzRDMA === true ? true : false, diff --git a/src/rest-server/src/templates/yarnContainerScript.mustache b/src/rest-server/src/templates/yarnContainerScript.mustache index 3fc6cc33b9..2db339f1aa 100644 --- a/src/rest-server/src/templates/yarnContainerScript.mustache +++ b/src/rest-server/src/templates/yarnContainerScript.mustache @@ -30,9 +30,33 @@ BASH_XTRACEFD=13 function exit_handler() { rc=$? - printf "[DEBUG] EXIT signal received in yarn container, performing clean up action...\n" + echo "Exited with $rc" + local handler="Yarn container exit handler" + debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..." 
+ + debug_log "$handler" "trying to kill docker container $docker_name" + docker logs $docker_name + docker inspect $docker_name + pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null) + if [ $pid -gt 0 ]; then + kill -9 $pid &&\ + debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\ + debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited." + else + debug_log "$handler" "docker container $docker_name has already exited" + is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null) + debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom" + if [ "$is_oom" = "true" ]; then + rc=55 + fi + fi - printf "[DEBUG] Write exit code $rc to file /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode.\n" + docker container rm $docker_name + pkill --parent $$ + + debug_log "$handler" "write exit code to file" + debug_log "$handler" "yarn container exit code: $rc" + debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode" echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode" exit $rc @@ -266,7 +290,7 @@ docker pull {{ jobData.image }} \ ## network consumption docker run --name $docker_name \ --init \ - --rm \ + --detach \ --tty \ --privileged=false \ --oom-score-adj=1000 \ @@ -306,3 +330,8 @@ docker run --name $docker_name \ {{ jobData.image }} \ /bin/bash '/pai/bootstrap/docker_bootstrap.sh' +docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" $docker_name) + +echo "Docker container pid is $docker_pid" + +docker attach --no-stdin $docker_name