Skip to content

Commit

Permalink
Add OOM guard (#141)
Browse files Browse the repository at this point in the history
* Add OOM guard

* Update neuro toolset

* Update GH Action

* Fix neuro-extras version in the header
  • Loading branch information
anayden authored May 27, 2021
1 parent e2c220f commit 38dfe09
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
run: |
python -m venv venv
source venv/bin/activate
pip install -U neuromation
pip install -U neuro-cli
- name: Configure environment
run: |
Expand Down
13 changes: 13 additions & 0 deletions files/root/oom_guard.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# OOM guard.
#
# Marks every process except PID 1 as a preferred victim of the kernel
# OOM killer by writing 1000 to its /proc/<pid>/oom_score_adj. PID 1 is
# left untouched, so it is only killed by the OOM killer as a last resort.
#
# The script takes no arguments. It is meant to be run periodically so that
# processes started after the previous run are covered as well.
#
# PIDs are taken from /proc directly rather than from `ps x`: `ps x` only
# lists processes owned by the invoking user and includes the short-lived
# helpers of its own pipeline, which are already gone when the write happens.

for proc_dir in /proc/[0-9]*
do
    pid="${proc_dir#/proc/}"
    if [ "$pid" != "1" ]
    then
        # A process may exit between the glob expansion and the write, or its
        # entry may not be writable; such failures are expected and ignored.
        # The redirection is wrapped in a group so that the shell's own
        # "No such file or directory" message is silenced too.
        { echo 1000 > "$proc_dir/oom_score_adj"; } 2>/dev/null || true
    fi
done
21 changes: 15 additions & 6 deletions targets/python37-jupyter-pytorch-tensorflow-jupyterlab/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
# jupyterlab latest (pip)
# pytorch 1.6.0 (docker-hub)
# tensorflow 2.3.0 (pip)
# neuro-cli 21.3.3 (pip)
# neuro-flow 21.3.17 (pip)
# neuro-extras 20.12.16 (pip)
# neuro-cli 21.5.17 (pip)
# neuro-flow 21.5.25 (pip)
# neuro-extras 21.3.19 (pip)
# ==================================================================

FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel
Expand All @@ -31,6 +31,7 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
apt-utils \
build-essential \
ca-certificates \
cron \
curl \
git \
libssl-dev \
Expand Down Expand Up @@ -124,9 +125,9 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
# ------------------------------------------------------------------

$PIP_INSTALL \
neuro-cli==21.3.3 \
neuro-flow==21.3.17 \
neuro-extras==20.12.16 \
neuro-cli==21.5.17 \
neuro-flow==21.5.25 \
neuro-extras==21.3.19 \
&& \

# ==================================================================
Expand Down Expand Up @@ -191,6 +192,14 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
$PIP_INSTALL --global-option="--cpp_ext" --global-option="--cuda_ext" \
git+https://github.com/NVIDIA/apex@2ec84ebdca59278eaf15e8ddf32476d9d6d8b904

# ==================================================================
# OOM guard
# Installs a script that marks every process except PID 1 as a preferred
# OOM-killer victim and schedules it to run every minute via crontab
# ==================================================================

COPY files/root/oom_guard.sh /root/oom_guard.sh
# cron runs the script directly, so make it executable regardless of the file
# mode in the build context. The current crontab (if any; a missing one is
# not an error here) is kept and the new entry is appended. `crontab -` is the
# documented way to read the new table from stdin.
# NOTE(review): the entry only takes effect if a cron daemon is running inside
# the container - confirm it is started by the image's entrypoint.
RUN chmod +x /root/oom_guard.sh && \
    { crontab -l 2>/dev/null; echo '* * * * * /root/oom_guard.sh'; } | crontab -

# ==================================================================
# Documentation notebook
# ------------------------------------------------------------------
Expand Down

0 comments on commit 38dfe09

Please sign in to comment.