Skip to content

Commit

Permalink
Add level4_excluded_files results field to Job db
Browse files Browse the repository at this point in the history
This allows us to keep a record of how many files fail the check, for
audit purposes.

It also allows us to surface the information better in local_run, as we
can pull it from the db.
  • Loading branch information
bloodearnest committed Nov 10, 2023
1 parent a8932da commit 7cb29d4
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 0 deletions.
7 changes: 7 additions & 0 deletions jobrunner/cli/local_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def create_and_run_jobs(
print(" outputs:")
outputs = sorted(job.outputs.items()) if job.outputs else []
print(tabulate(outputs, separator=" - ", indent=5, empty="(no outputs)"))

if job.level4_excluded_files:
print(" invalid moderately_sensitive outputs:")
print(
tabulate(job.level4_excluded_files.items(), separator=" - ", indent=5)
)

# If a job exited with an error code then try to display the end of the
# log output in case that makes the problem immediately obvious
if job.status_code == StatusCode.NONZERO_EXIT:
Expand Down
10 changes: 10 additions & 0 deletions jobrunner/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class Job:
completed_at INT,
trace_context TEXT,
status_code_updated_at INT,
level4_excluded_files TEXT,
PRIMARY KEY (id)
);
Expand All @@ -179,6 +180,13 @@ class Job:
""",
)

# Migration 2: persist the set of files excluded from level-4
# (moderately_sensitive) output checks on each job, for audit purposes
# and so local_run can report them later. Stored as TEXT — presumably
# serialized JSON mapping filename -> reason; confirm against the
# Job.level4_excluded_files dict field.
migration(
2,
"""
ALTER TABLE job ADD COLUMN level4_excluded_files TEXT;
""",
)

id: str = None # noqa: A003
job_request_id: str = None
state: State = None
Expand Down Expand Up @@ -237,6 +245,8 @@ class Job:
# used to track the OTel trace context for this job
trace_context: dict = None

level4_excluded_files: dict = None

# used to cache the job_request json by the tracing code
_job_request = None

Expand Down
1 change: 1 addition & 0 deletions jobrunner/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ def save_results(job, job_definition, results):
"""Extract the results of the execution and update the job accordingly."""
# save job outputs
job.outputs = results.outputs
job.level4_excluded_files = results.level4_excluded_files

message = None
error = False
Expand Down
31 changes: 31 additions & 0 deletions tests/cli/test_local_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pytest
from pipeline import load_pipeline
from ruyaml import YAML

from jobrunner import config
from jobrunner.actions import get_action_specification
Expand Down Expand Up @@ -40,6 +41,36 @@ def test_local_run_limits_applied(db, tmp_path, docker_cleanup):
assert metadata["HostConfig"]["NanoCpus"] == 1.5 * 1e9


@pytest.mark.slow_test
@pytest.mark.needs_docker
def test_local_run_level_4_checks_applied_and_logged(
    db, tmp_path, docker_cleanup, capsys
):
    """Run a project whose output fails the level-4 checks and verify the
    excluded files are recorded on the Job row and shown in the console output.
    """
    # Copy the fixture project, then rewrite generate_dataset's output level
    # from highly_sensitive to moderately_sensitive so the level-4 file
    # checks apply to it.
    workdir = tmp_path / "project"
    shutil.copytree(str(FIXTURE_DIR / "full_project"), workdir)
    yaml_path = workdir / "project.yaml"
    parser = YAML()
    config = parser.load(yaml_path)
    action_outputs = config["actions"]["generate_dataset"]["outputs"]
    action_outputs["moderately_sensitive"] = action_outputs.pop("highly_sensitive")
    parser.dump(config, yaml_path)

    local_run.main(
        project_dir=workdir,
        actions=["generate_dataset"],
        debug=True,  # preserves containers for inspection
    )

    # The excluded file and the reason should have been persisted on the job.
    job = next(iter(database.find_all(Job)))
    assert job.level4_excluded_files == {
        "output/dataset.csv": "File has patient_id column"
    }

    # It should also be surfaced in local_run's printed summary.
    captured = capsys.readouterr().out
    assert "invalid moderately_sensitive outputs:" in captured
    assert "output/dataset.csv - File has patient_id column" in captured


@pytest.mark.parametrize("extraction_tool", ["cohortextractor", "databuilder"])
@pytest.mark.slow_test
@pytest.mark.needs_docker
Expand Down

0 comments on commit 7cb29d4

Please sign in to comment.