From 0cdf130e0a9ed17e4561d585c09a7bc52ad80c31 Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Thu, 26 Sep 2024 20:56:56 +0100
Subject: [PATCH 1/5] Switch to using --mount instead of --volume

The --mount option is more strict that the --volume option, and will
error if the host source directory does not exist, whereas --volume will
silently create a directory.  We want the strict behaviour, so this
changes to use --mount.

The motivation for this change is to support job-runner running in
docker, which requires that the paths it uses are correct from the
host's perspective.  Currently, a incorrect path will be sort of work,
as the directory would be created inside the container, but only later
would the issue arise.

https://docs.docker.com/engine/storage/bind-mounts/#differences-between--v-and---mount-behavior
---
 jobrunner/executors/local.py   | 2 ++
 jobrunner/executors/volumes.py | 2 ++
 jobrunner/lib/docker.py        | 5 ++++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/jobrunner/executors/local.py b/jobrunner/executors/local.py
index 277173de..fa1a547c 100644
--- a/jobrunner/executors/local.py
+++ b/jobrunner/executors/local.py
@@ -175,7 +175,9 @@ def execute(self, job_definition):
                 label=LABEL,
                 labels=get_job_labels(job_definition),
                 extra_args=extra_args,
+                volume_type=volume_api.volume_type,
             )
+
         except Exception as exc:
             return JobStatus(
                 ExecutorState.ERROR, f"Failed to start docker container: {exc}"
diff --git a/jobrunner/executors/volumes.py b/jobrunner/executors/volumes.py
index 879cfdf9..cef0a1d2 100644
--- a/jobrunner/executors/volumes.py
+++ b/jobrunner/executors/volumes.py
@@ -37,6 +37,7 @@ class DockerVolumeAPI:
     # don't run with UIDs for now. We maybe be able to support this in future.
     requires_root = True
     supported_platforms = ("linux", "win32", "darwin")
+    volume_type = "volume"  # https://docs.docker.com/engine/storage/volumes/
 
     def volume_name(job):
         return docker_volume_name(job)
@@ -93,6 +94,7 @@ class BindMountVolumeAPI:
     # Only works running jobs with uid:gid
     requires_root = False
     supported_platforms = ("linux",)
+    volume_type = "bind"  # https://docs.docker.com/engine/storage/bind-mounts/
 
     def volume_name(job):
         """Return the absolute path to the volume directory.
diff --git a/jobrunner/lib/docker.py b/jobrunner/lib/docker.py
index 54df55f3..2f6a4a06 100644
--- a/jobrunner/lib/docker.py
+++ b/jobrunner/lib/docker.py
@@ -405,6 +405,7 @@ def run(
     label=None,
     labels=None,
     extra_args=None,
+    volume_type="volume",
 ):
     run_args = ["run", "--init", "--detach", "--label", LABEL, "--name", name]
     if extra_args is not None:
@@ -413,7 +414,9 @@ def run(
     if not allow_network_access:
         run_args.extend(["--network", "none"])
     if volume:
-        run_args.extend(["--volume", f"{volume[0]}:{volume[1]}"])
+        run_args.extend(
+            ["--mount", f"type={volume_type},source={volume[0]},target={volume[1]}"]
+        )
     # These lables are in addition to the default LABEL which is always applied
     # Single unary label
     if label is not None:

From 68fa1249b6d474d7d36483149920d579ffcb9295 Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Fri, 27 Sep 2024 11:46:15 +0100
Subject: [PATCH 2/5] Remove docker-over-ssh experiment support

Was slow, complex, and unused
---
 docker/dependencies.txt    |  1 -
 docker/docker-compose.yaml |  2 --
 docker/justfile            | 38 --------------------------------------
 docker/ssh/config          |  9 ---------
 4 files changed, 50 deletions(-)
 delete mode 100644 docker/ssh/config

diff --git a/docker/dependencies.txt b/docker/dependencies.txt
index 1fe8743b..4ed2aed1 100644
--- a/docker/dependencies.txt
+++ b/docker/dependencies.txt
@@ -2,4 +2,3 @@
 docker.io
 sqlite3
 git
-openssh-client
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 5eae6d7c..cbd3661d 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -27,8 +27,6 @@ services:
     volumes:
       # use the default dev workdir
       - ../workdir:/workdir
-      # used to configure ssh access for docker
-      - ./ssh:/home/opensafely/.ssh
       # docker control
       - /var/run/docker.sock:/var/run/docker.sock
     # paths relative to docker-compose.yaml file
diff --git a/docker/justfile b/docker/justfile
index ed802209..f32fb3b6 100644
--- a/docker/justfile
+++ b/docker/justfile
@@ -3,15 +3,6 @@ set dotenv-load := true
 export DOCKER_USERID := `id -u`
 export DOCKER_GROUPID := `id -g`
 export PYTEST_HOST_TMP := "/tmp/jobrunner-docker"
-export SSH_KEY := "ssh/id_jobrunner_dev"
-
-# used to identify the dev ssh key we generate
-
-export SSH_COMMENT := "local jobrunner dev key"
-
-# what is the hosts ip from docker containers POV?
-
-export SSH_HOST := `docker network inspect bridge --format='{{(index .IPAM.Config 0).Gateway}}'`
 
 # the dev build needs to know the hosts docker group ID, to match it in the container
 
@@ -71,32 +62,3 @@ clean:
     #!/bin/bash
     rm -rf $PYTEST_HOST_TMP
     docker image rm job-runner job-runner-dev || true
-    # clean up local ssh config
-    rm -f ssh/known_hosts $SSH_KEY*
-    sed -i '/$(SSH_COMMENT)/d' ~/.ssh/authorized_keys
-
-# setup dev ssh key and config
-setup-ssh:
-    #!/bin/bash -x
-    test -f "$SSH_KEY" && { echo "ssh key aleady set up"; exit 0; }
-
-    # create known_hosts file so ssh works without prompting
-    ssh-keyscan $SSH_HOST > ssh/known_hosts
-
-    # create the key
-    ssh-keygen -t ed25519 -N '' -C "$SSH_COMMENT" -f "$SSH_KEY"
-
-    # this is a little unpleasent, modifying the users authorizesd keys
-    # we do use the comment in clean command  to clean it up automatically though.
-    grep -q "$(cat $SSH_KEY.pub)" ~/.ssh/authorized_keys || cat $SSH_KEY.pub >> ~/.ssh/authorized_keys
-
-    # quick test to fail early if ssh doesn't work for some reason
-    ssh -i $SSH_KEY -o UserKnownHostsFile=ssh/known_hosts $USER@$SSH_HOST true || { echo "Failed to ssh into $SSH_HOST"; tail /var/log/auth.log; exit 1; }
-
-# enable using ssh to talk to docker on host
-enable-ssh: setup-ssh
-    echo "DOCKER_HOST=ssh://$USER@$SSH_HOST" > docker-compose.env
-
-# disable using ssh to talk to docker on host
-disable-ssh:
-    rm docker-compose.env
diff --git a/docker/ssh/config b/docker/ssh/config
deleted file mode 100644
index 2931d410..00000000
--- a/docker/ssh/config
+++ /dev/null
@@ -1,9 +0,0 @@
-# As recommended by docker: 
-#
-# https://docs.docker.com/engine/security/protect-access/#ssh-tips
-#
-# These settings maintain a persistant SSH connection, which speeds up the
-# tests quite a bit.
-ControlMaster     auto
-ControlPath       /tmp/control-%C
-ControlPersist    yes

From 596c58ace8dc563c09392d5e618f9e74757b4026 Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Fri, 27 Sep 2024 11:54:01 +0100
Subject: [PATCH 3/5] Move to explicit 22.04 base image

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1c52e3ca..cc0c63a7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -7,7 +7,7 @@
 # the latest base image, by design.
 #
 # hadolint ignore=DL3007
-FROM ghcr.io/opensafely-core/base-docker:latest as base-python
+FROM ghcr.io/opensafely-core/base-docker:22.04 as base-python
 
 # we are going to use an apt cache on the host, so disable the default debian
 # docker clean up that deletes that cache on every apt install

From f3d19041426f807a455e162c390c7221dfa39c5a Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Fri, 27 Sep 2024 12:01:28 +0100
Subject: [PATCH 4/5] Fix docker image permissions

This simplifies the docker container by removing all the previous
user/group management inside the container as it is not needed. Instead,
we used the much simpler solution of adding an additional group to the
running user that is the host's docker group id. This allows the
container to have permission to access the host's bind-mounted
/var/run/docker.sock docker, but the container itself can run as the
local running user, which DTRT with respect to file permissions on bind
mounts.
---
 docker/Dockerfile          | 39 --------------------------------------
 docker/docker-compose.yaml | 20 ++++++++-----------
 2 files changed, 8 insertions(+), 51 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index cc0c63a7..e9228cc7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -13,12 +13,6 @@ FROM ghcr.io/opensafely-core/base-docker:22.04 as base-python
 # docker clean up that deletes that cache on every apt install
 RUN rm -f /etc/apt/apt.conf.d/docker-clean
 
-# preemptively create a docker group using specific gid, before docker is
-# installed. This allows us to set the host docker gid to the same thing in
-# systems we control, and then we can just mount /var/run/docker.sock into the
-# container.
-RUN groupadd --gid 10010 docker
-
 # ensure fully working base python3 installation
 # see: https://gist.github.com/tiran/2dec9e03c6f901814f6d1e8dad09528e
 # use space efficient utility from base image
@@ -81,13 +75,6 @@ RUN --mount=type=cache,target=/root/.cache \
 #    version of the code included when the prod image was built.
 FROM base-python as job-runner-base
 
-# Create a non-root opensafely user to run the app as
-# We currently use host bind mounts in production backends, which means if we
-# need to match the uid inside the container to the one on the host.  To do
-# this, we reserve uid/gid 10000 to use as our running user on the host.
-# Note: we are currently not running as this user, but we will in future.
-RUN useradd --create-home --user-group --uid 10000 -G docker opensafely
-
 # copy venv over from builder image. These will have root:root ownership, but
 # are readable by all.
 COPY --from=builder /opt/venv /opt/venv
@@ -105,11 +92,6 @@ CMD ["/opt/venv/bin/python", "-m", "jobrunner.service"]
 # This may not be necessary, but it probably doesn't hurt
 ENV PYTHONPATH=/app
 
-# We are not ready to do this step yet
-# switch to running as the user
-# USER opensafely
-
-
 ##################################################
 #
 # Production image
@@ -145,9 +127,6 @@ LABEL org.opencontainers.image.revision=$GITREF
 #
 FROM job-runner-base as job-runner-dev
 
-# switch back to root to run the install of dev requirements.txt
-#USER root
-
 # TODO: its possible python dev dependencies might need some additional build packages installed?
 
 # install development requirements
@@ -156,21 +135,3 @@ COPY requirements.dev.txt /tmp/requirements.dev.txt
 # hadolint ignore=DL3042
 RUN --mount=type=cache,target=/root/.cache \
     python -m pip install --requirement /tmp/requirements.dev.txt
-
-# modify container docker gid to match host
-ARG DOCKER_HOST_GROUPID
-RUN groupmod -g $DOCKER_HOST_GROUPID docker
-
-# in dev, ensure opensafely uid matches host user id
-ARG DEV_USERID=1000
-ARG DEV_GROUPID=1000
-RUN usermod -u $DEV_USERID opensafely
-# Modify opensafely only if group id does not already exist. We run dev
-# containers with an explicit group id anyway, so file permissions on the host
-# will be correct, and we do not actually rely on named opensafely group access to
-# anything.
-RUN grep -q ":$DEV_GROUPID:" /etc/group || groupmod -g $DEV_GROUPID opensafely
-
-
-# switch back to opensafely
-#USER opensafely
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index cbd3661d..4eeb78a1 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -23,12 +23,19 @@ services:
         - GITREF
     # use dockers builitin PID daemon
     init: true
-    # paths relative to docker-compose.yaml
+    # run as the the current user to preserve file permissions
+    user: "${DEV_USERID:-1000}:${DEV_GROUPID:-1000}"
+    # run with additional group docker (the *hosts* gid for docker)
+    group_add: ["${DOCKER_HOST_GROUPID}"]
+    # paths are relative to docker-compose.yaml
     volumes:
       # use the default dev workdir
       - ../workdir:/workdir
       # docker control
       - /var/run/docker.sock:/var/run/docker.sock
+      # used to avoid ugly uid/gid lookup failures, but not strictly required
+      - /etc/group:/etc/group:ro
+      - /etc/passwd:/etc/passwd:ro
     # paths relative to docker-compose.yaml file
     env_file:
       - ../.env
@@ -48,20 +55,9 @@ services:
         service: prod
     image: job-runner-dev
     container_name: job-runner-dev
-    # running as a specific uid/gid allows files written to mounted volumes by
-    # the docker container's default user to match the host user's uid/gid, for
-    # convienience.
-    user: ${DEV_USERID:-1000}:${DEV_GROUPID:-1000}
-    # also run with additional group docker
-    group_add: [docker]
     build:
       # the dev stage in the Dockerfile
       target: job-runner-dev
-      # pass the uid/gid as build arg
-      args:
-        - DEV_USERID=${DEV_USERID:-1000}
-        - DEV_GROUPID=${DEV_GROUPID:-1000}
-        - DOCKER_HOST_GROUPID=${DOCKER_HOST_GROUPID}
     # Some tricks are needed here to be able to test the BindMountVolumeAPI
     # when running inside docker, as we need the volumes to be mountable by the
     # host docker. Our pytest fixtures create the directories in /tmp, so we

From 7d73d6abb4294169193f1b1bd1bc4ad1d6ebc452 Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Fri, 27 Sep 2024 12:05:58 +0100
Subject: [PATCH 5/5] Fix docker configuration

The job-runner container uses the host's docker service. This means that
when it bind mounts directories with `docker run`, the source path must
exist on the *hosts* file system.

We've fixed this already for tests, with PYTEST_HOST_TMP, but we had not
actually fixed it for actual using the docker image for real. Before the
previous change to use --mount rather than --volume, a job would
succeed, but the files would be in the wrong location. Now we have that
change, the job fails.

We use a simple if inelegant fix for this: we mount the STORAGE_BASE
directories inside the container at the same path as on the host. This
means that the same config value can be used for job-runner copying
files itself, or for passing the host's docker to mount.

As part of this, I changedthe way we supply these values to being in
one time generated docker/.env file. This is more discoverable, usable
outside of just, and more efficant than before. I chose to use
`docker/medium` and `docker/high` as the volume mount locations, for
simplicity, and also, so the the docker machinery is isolated from the
non-docker venv set up.

This include a basic `just docker/functional-test` command to spin up
a job-runner container and run an actual job on it.
---
 docker/.gitignore          |  2 ++
 docker/docker-compose.yaml | 24 ++++++++++++-----
 docker/justfile            | 54 ++++++++++++++++++++++++++------------
 3 files changed, 57 insertions(+), 23 deletions(-)
 create mode 100644 docker/.gitignore

diff --git a/docker/.gitignore b/docker/.gitignore
new file mode 100644
index 00000000..58c81adb
--- /dev/null
+++ b/docker/.gitignore
@@ -0,0 +1,2 @@
+medium/
+high/
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 4eeb78a1..56cd491c 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -1,6 +1,8 @@
 # note: this docker-compose file is intended to define the *building* of
 # job-runner images, and running them locally in dev, not for running them in
 # production
+name: job-runner
+
 services:
   prod:
     # image name, both locally and public
@@ -29,24 +31,34 @@ services:
     group_add: ["${DOCKER_HOST_GROUPID}"]
     # paths are relative to docker-compose.yaml
     volumes:
-      # use the default dev workdir
-      - ../workdir:/workdir
       # docker control
       - /var/run/docker.sock:/var/run/docker.sock
+      # use the default dev workdir
+      - ../workdir:/workdir
       # used to avoid ugly uid/gid lookup failures, but not strictly required
       - /etc/group:/etc/group:ro
       - /etc/passwd:/etc/passwd:ro
+      # these paths must be absolute on the host, and must be mount to the
+      # identical absolute paths inside the container. This allows for
+      # job-runner to use the same value for both its own file access, and for
+      # what to pass to the host docker as a bindmount source directory, which
+      # needs to be on the host. 
+      # We can potentially fix this by adding an explicit bind mount source configs, if we want to
+      - ${MEDIUM_PRIVACY_STORAGE_BASE}:${MEDIUM_PRIVACY_STORAGE_BASE}
+      - ${HIGH_PRIVACY_STORAGE_BASE}:${HIGH_PRIVACY_STORAGE_BASE}
     # paths relative to docker-compose.yaml file
     env_file:
+      # load jobrunner local dev config
       - ../.env
-      - docker-compose.env
+      # explicitly load local .env after parent dir .env
+      - .env
     # ensure WORKDIR environment points to fixed location
     environment:
       # default dev config
       WORKDIR: /workdir
-      MEDIUM_PRIVACY_STORAGE_BASE: /workdir/workspaces
-      HIGH_PRIVACY_STORAGE_BASE: /workdir/workspaces
-      DOCKER_HOST: ${DOCKER_HOST:-unix:///var/run/docker.sock}
+      # force using mounted docker socket
+      DOCKER_HOST: unix:///var/run/docker.sock
+      # used to enable in CI
       PRIVATE_REPO_TOKEN: ${PRIVATE_REPO_TOKEN:-}
 
   # main development service
diff --git a/docker/justfile b/docker/justfile
index f32fb3b6..95834ffa 100644
--- a/docker/justfile
+++ b/docker/justfile
@@ -1,26 +1,13 @@
 set dotenv-load := true
 
-export DOCKER_USERID := `id -u`
-export DOCKER_GROUPID := `id -g`
 export PYTEST_HOST_TMP := "/tmp/jobrunner-docker"
 
-# the dev build needs to know the hosts docker group ID, to match it in the container
-
-export DOCKER_HOST_GROUPID := `getent group docker | awk -F: '{print $3}'`
-
-# dev builds remap's appuser's uid to the running user, for easy file
-# permissions when we mount things in.
-
-export DEV_USERID := `id -u`
-export DEV_GROUPID := `id -g`
-
-build env="dev": tmpdir
+build env="dev": tmpdir docker-compose-env
     #!/usr/bin/env bash
     set -eu
 
-    # ensure env files exist
+    # ensure parent env files exist
     test -f ../.env || cp ../dotenv-sample ../.env
-    touch docker-compose.env
 
     # enable modern docker build features
     export DOCKER_BUILDKIT=1
@@ -41,17 +28,50 @@ tmpdir:
     fi
     mkdir $PYTEST_HOST_TMP
 
+docker-compose-env:
+    #!/bin/bash
+    mkdir -p {medium,high}
+    test -f .env && exit 0
+    cat <<EOF > .env
+    DEV_USERID=$(id -u)
+    DEV_GROUPID=$(id -g)
+    DOCKER_HOST_GROUPID=$(getent group docker | awk -F: '{print $3}')
+    PYTEST_HOST_TMP=$PYTEST_HOST_TMP
+    MEDIUM_PRIVACY_STORAGE_BASE=$(realpath $PWD/medium)
+    HIGH_PRIVACY_STORAGE_BASE=$(realpath $PWD/high)
+    EOF
+
 # run tests in docker container
 test *args: build
     #!/bin/bash
     docker compose run --rm test {{ args }}
 
+functional-test:
+    #!/bin/bash
+    container=job-runner-prod-1
+    trap 'docker compose kill prod' EXIT
+    docker compose up -d prod
+    docker compose exec prod python3 -m jobrunner.cli.add_job https://github.com/opensafely/research-template generate_dataset
+    attempts=""
+    # use docker logs reather than docker compose logs, as those will include previous run's logs 
+    while ! docker logs -n 10 $container |& grep -q "Completed successfully";
+    do
+        if test "$attempts" = ".........."; then
+            docker compose logs prod
+            exit 1;
+        fi
+        attempts="${attempts}."
+        echo "waiting..."
+        sleep 1
+    done
+    docker logs -n 10 $container |& grep "Completed successfully"
+
 # run dev server in docker container
 service: build
     docker compose up dev
 
 # run command in dev container
-run env="dev" *args="bash": build
+run env="dev" *args="": build
     docker compose run --rm {{ env }} {{ args }}
 
 # exec command in existing dev container
@@ -60,5 +80,5 @@ exec *args="bash":
 
 clean:
     #!/bin/bash
-    rm -rf $PYTEST_HOST_TMP
+    rm -rf $PYTEST_HOST_TMP .env
     docker image rm job-runner job-runner-dev || true