Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Fix auto retries when out of memory. (#1108)
Browse files Browse the repository at this point in the history
* Distinguish cgroup OOM from dmesg.

* Remove cgroup OOM detection

Make every OOM case exit with code 5

* Exit with code 55 on OOM
  • Loading branch information
Gerhut authored Mar 1, 2019
1 parent 9ef4bb8 commit 953d655
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 4 deletions.
3 changes: 2 additions & 1 deletion src/rest-server/src/models/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,8 @@ class Job {
'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName),
'taskData': data.taskRoles[idx],
'jobData': data,
'inspectFormat': '{{.State.Pid}}',
'inspectPidFormat': '{{.State.Pid}}',
'inspectOOMKilledFormat': '{{.State.OOMKilled}}',
'jobEnvs': jobEnvs,
'azRDMA': azureEnv.azRDMA === 'false' ? false : true,
'reqAzRDMA': data.jobEnvs && data.jobEnvs.paiAzRDMA === true ? true : false,
Expand Down
35 changes: 32 additions & 3 deletions src/rest-server/src/templates/yarnContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,33 @@ BASH_XTRACEFD=13
function exit_handler()
{
rc=$?
printf "[DEBUG] EXIT signal received in yarn container, performing clean up action...\n"
echo "Exited with $rc"
local handler="Yarn container exit handler"
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."

debug_log "$handler" "trying to kill docker container $docker_name"
docker logs $docker_name
docker inspect $docker_name
pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null)
if [ $pid -gt 0 ]; then
kill -9 $pid &&\
debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
else
debug_log "$handler" "docker container $docker_name has already exited"
is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null)
debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom"
if [ "$is_oom" = "true" ]; then
rc=55
fi
fi

printf "[DEBUG] Write exit code $rc to file /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode.\n"
docker container rm $docker_name
pkill --parent $$

debug_log "$handler" "write exit code to file"
debug_log "$handler" "yarn container exit code: $rc"
debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"

exit $rc
Expand Down Expand Up @@ -266,7 +290,7 @@ docker pull {{ jobData.image }} \
## network consumption
docker run --name $docker_name \
--init \
--rm \
--detach \
--tty \
--privileged=false \
--oom-score-adj=1000 \
Expand Down Expand Up @@ -306,3 +330,8 @@ docker run --name $docker_name \
{{ jobData.image }} \
/bin/bash '/pai/bootstrap/docker_bootstrap.sh'

# Record the State.Pid value docker reports for the container so that it
# shows up in the yarn container log.
# The container name is quoted so it is always passed as a single argument.
docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" "$docker_name")

echo "Docker container pid is $docker_pid"

# The container is started with --detach, so attach to it here to stream its
# output into this script's output; --no-stdin keeps this script's stdin
# from being forwarded into the container.
docker attach --no-stdin "$docker_name"

0 comments on commit 953d655

Please sign in to comment.