From a47b93230f65a4c4791b163e977b98ee7e38e37f Mon Sep 17 00:00:00 2001 From: Peter Inglesby Date: Thu, 7 Apr 2022 13:26:07 +0100 Subject: [PATCH] Limit the allowed size of moderately-sensitive outputs Fixes #298 --- jobrunner/config.py | 3 +++ jobrunner/executors/local.py | 13 ++++++++++++ jobrunner/lib/docker.py | 19 ++++++++++++++++++ tests/test_local_executor.py | 39 ++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/jobrunner/config.py b/jobrunner/config.py index 0ccc340f..a7316d14 100644 --- a/jobrunner/config.py +++ b/jobrunner/config.py @@ -38,6 +38,9 @@ def _is_valid_backend_name(name): "HIGH_PRIVACY_ARCHIVE_DIR", HIGH_PRIVACY_STORAGE_BASE / "archives" ) +# The maximum size of all moderately-sensitive output files +MAX_OUTPUT_SIZE_BYTES = 32 * 2**20 # 32MB + # valid archive formats ARCHIVE_FORMATS = (".tar.gz", ".tar.zstd", ".tar.xz") diff --git a/jobrunner/executors/local.py b/jobrunner/executors/local.py index 16923796..396c5c0f 100644 --- a/jobrunner/executors/local.py +++ b/jobrunner/executors/local.py @@ -252,10 +252,23 @@ def finalize_job(job): image_id=container_metadata["Image"], message=message, ) + output_size_bytes = get_output_size_bytes(job, results.outputs) + if output_size_bytes > config.MAX_OUTPUT_SIZE_BYTES: + if results.message is None: + results.message = f"total size of moderately-sensitive outputs exceeded {config.MAX_OUTPUT_SIZE_BYTES}" persist_outputs(job, results.outputs, container_metadata) RESULTS[job.id] = results +def get_output_size_bytes(job, outputs): + volume = volume_name(job) + return sum( + docker.get_file_size_bytes(volume, filename) + for filename, privacy_level in outputs.items() + if privacy_level == "moderately_sensitive" + ) + + def persist_outputs(job, outputs, container_metadata): """Copy logs and generated outputs to persistant storage.""" # job_metadata is a big dict capturing everything we know about the state diff --git a/jobrunner/lib/docker.py b/jobrunner/lib/docker.py index 6e49e0fd..7eff7bc4 100644 --- a/jobrunner/lib/docker.py +++ b/jobrunner/lib/docker.py @@ -280,6 +280,25 @@ def find_newer_files(volume_name, reference_file): return sorted(files) +def get_file_size_bytes(volume_name, path): + """Return size in bytes of file at path.""" + response = docker( + [ + "container", + "exec", + manager_name(volume_name), + "stat", + "-c%s", + f"{VOLUME_MOUNT_POINT}/{path}", + ], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + ) + return int(response.stdout.strip()) + + def manager_name(volume_name): return f"{volume_name}-manager" diff --git a/tests/test_local_executor.py b/tests/test_local_executor.py index a238e07e..34ea3de0 100644 --- a/tests/test_local_executor.py +++ b/tests/test_local_executor.py @@ -459,6 +459,45 @@ def test_finalize_failed_137(use_api, docker_cleanup, test_repo, tmp_work_dir): assert local.RESULTS[job.id].message == "likely means it ran out of memory" +@pytest.mark.needs_docker +def test_finalize_failed_too_many_outputs( + use_api, docker_cleanup, test_repo, tmp_work_dir +): + ensure_docker_images_present("busybox") + + job = JobDefinition( + id="test_finalize_failed_too_many_outputs", + study=test_repo.study, + workspace="test", + action="action", + image="ghcr.io/opensafely-core/busybox", + args=["cp", "/workspace/output/input.csv", "/workspace/output/summary.csv"], + env={}, + inputs=["output/input.csv"], + output_spec={"output/summary.csv": "moderately_sensitive"}, + allow_database_access=False, + ) + + populate_workspace(job.workspace, "output/input.csv", content="X" * (33 * 2**20)) + + api = local.LocalDockerAPI() + + status = api.prepare(job) + assert status.state == ExecutorState.PREPARING + status = api.execute(job) + assert status.state == ExecutorState.EXECUTING + wait_for_state(api, job, ExecutorState.EXECUTED) + status = api.finalize(job) + assert status.state == ExecutorState.FINALIZING + + # we don't need to wait + assert api.get_status(job).state == ExecutorState.FINALIZED + assert job.id in local.RESULTS + results = api.get_results(job) + assert results.exit_code == 0 + assert "outputs exceeded" in local.RESULTS[job.id].message + + @pytest.mark.needs_docker def test_cleanup_success(use_api, docker_cleanup, test_repo, tmp_work_dir): ensure_docker_images_present("busybox")