Limit the allowed size of moderately-sensitive outputs
Fixes #298
inglesp committed Apr 7, 2022

1 parent 5b097e6 commit a47b932
Showing 4 changed files with 74 additions and 0 deletions.
3 changes: 3 additions & 0 deletions jobrunner/config.py
@@ -38,6 +38,9 @@ def _is_valid_backend_name(name):
    "HIGH_PRIVACY_ARCHIVE_DIR", HIGH_PRIVACY_STORAGE_BASE / "archives"
)

# The maximum size of all moderately-sensitive output files
MAX_OUTPUT_SIZE_BYTES = 32 * 2**20 # 32MB

# valid archive formats
ARCHIVE_FORMATS = (".tar.gz", ".tar.zstd", ".tar.xz")
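
For orientation, 32 * 2**20 is 32 MiB expressed in bytes. A quick arithmetic check (a hypothetical snippet, not part of the commit):

assert 32 * 2**20 == 33_554_432  # 32 MiB in bytes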

13 changes: 13 additions & 0 deletions jobrunner/executors/local.py
@@ -252,10 +252,23 @@ def finalize_job(job):
            image_id=container_metadata["Image"],
            message=message,
        )
    output_size_bytes = get_output_size_bytes(job, results.outputs)
    if output_size_bytes > config.MAX_OUTPUT_SIZE_BYTES:
        if results.message is None:
            results.message = f"total size of moderately-sensitive outputs exceeded {config.MAX_OUTPUT_SIZE_BYTES}"
    persist_outputs(job, results.outputs, container_metadata)
    RESULTS[job.id] = results


def get_output_size_bytes(job, outputs):
    volume = volume_name(job)
    return sum(
        docker.get_file_size_bytes(volume, filename)
        for filename, privacy_level in outputs.items()
        if privacy_level == "moderately_sensitive"
    )


def persist_outputs(job, outputs, container_metadata):
    """Copy logs and generated outputs to persistent storage."""
    # job_metadata is a big dict capturing everything we know about the state
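
A minimal sketch (hypothetical data, not part of the commit) of how the privacy-level filter in get_output_size_bytes behaves: only files marked moderately_sensitive contribute to the total, so highly-sensitive outputs never count against the cap.

outputs = {
    "output/summary.csv": "moderately_sensitive",
    "output/cohort.csv": "highly_sensitive",
}
counted = [f for f, level in outputs.items() if level == "moderately_sensitive"]
assert counted == ["output/summary.csv"]  # only this file's size would be summed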
19 changes: 19 additions & 0 deletions jobrunner/lib/docker.py
@@ -280,6 +280,25 @@ def find_newer_files(volume_name, reference_file):
    return sorted(files)


def get_file_size_bytes(volume_name, path):
    """Return size in bytes of file at path."""
    response = docker(
        [
            "container",
            "exec",
            manager_name(volume_name),
            "stat",
            "-c%s",
            f"{VOLUME_MOUNT_POINT}/{path}",
        ],
        check=True,
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    return int(response.stdout.strip())


def manager_name(volume_name):
    return f"{volume_name}-manager"
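
The new helper runs stat inside the volume's manager container (see manager_name above); the -c%s format string makes stat print just the file size in bytes. A standalone sketch of the same invocation using the plain subprocess module (the container name and path in the usage line are assumptions, for illustration only):

import subprocess

def file_size_bytes(container, path):
    # Equivalent to: docker container exec <container> stat -c%s <path>
    result = subprocess.run(
        ["docker", "container", "exec", container, "stat", "-c%s", path],
        check=True,
        capture_output=True,
        text=True,
    )
    return int(result.stdout.strip())

# e.g. file_size_bytes("myvolume-manager", "/workspace/output/summary.csv")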

39 changes: 39 additions & 0 deletions tests/test_local_executor.py
@@ -459,6 +459,45 @@ def test_finalize_failed_137(use_api, docker_cleanup, test_repo, tmp_work_dir):
    assert local.RESULTS[job.id].message == "likely means it ran out of memory"


@pytest.mark.needs_docker
def test_finalize_failed_too_many_outputs(
    use_api, docker_cleanup, test_repo, tmp_work_dir
):
    ensure_docker_images_present("busybox")

    job = JobDefinition(
        id="test_finalize_failed_too_many_outputs",
        study=test_repo.study,
        workspace="test",
        action="action",
        image="ghcr.io/opensafely-core/busybox",
        args=["cp", "/workspace/output/input.csv", "/workspace/output/summary.csv"],
        env={},
        inputs=["output/input.csv"],
        output_spec={"output/summary.csv": "moderately_sensitive"},
        allow_database_access=False,
    )

    populate_workspace(job.workspace, "output/input.csv", content="X" * (33 * 2**20))

    api = local.LocalDockerAPI()

    status = api.prepare(job)
    assert status.state == ExecutorState.PREPARING
    status = api.execute(job)
    assert status.state == ExecutorState.EXECUTING
    wait_for_state(api, job, ExecutorState.EXECUTED)
    status = api.finalize(job)
    assert status.state == ExecutorState.FINALIZING

    # finalize() completes synchronously here, so we don't need to wait
    assert api.get_status(job).state == ExecutorState.FINALIZED
    assert job.id in local.RESULTS
    results = api.get_results(job)
    assert results.exit_code == 0
    assert "outputs exceeded" in local.RESULTS[job.id].message
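
The test is sized to land just over the limit: the job copies a 33 MiB input into a moderately-sensitive output, while the cap is 32 MiB, so finalization records the failure message even though the container itself exits 0 (hence the exit_code == 0 assertion). A quick check of the arithmetic (not part of the commit):

assert 33 * 2**20 > 32 * 2**20  # 34,603,008 > 33,554,432 bytes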


@pytest.mark.needs_docker
def test_cleanup_success(use_api, docker_cleanup, test_repo, tmp_work_dir):
    ensure_docker_images_present("busybox")
