Skip to content

Commit

Permalink
Add level4_excluded_files results field to Job db
Browse files Browse the repository at this point in the history
This allows us to keep a record of how many files fail the check, for
audit purposes.

It also allows us to surface the information better in local_run, as we
can pull it from the db.
  • Loading branch information
bloodearnest committed Nov 10, 2023
1 parent a8932da commit 7cb29d4
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 0 deletions.
7 changes: 7 additions & 0 deletions jobrunner/cli/local_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def create_and_run_jobs(
print(" outputs:")
outputs = sorted(job.outputs.items()) if job.outputs else []
print(tabulate(outputs, separator=" - ", indent=5, empty="(no outputs)"))

if job.level4_excluded_files:
print(" invalid moderately_sensitive outputs:")
print(
tabulate(job.level4_excluded_files.items(), separator=" - ", indent=5)
)

# If a job exited with an error code then try to display the end of the
# log output in case that makes the problem immediately obvious
if job.status_code == StatusCode.NONZERO_EXIT:
Expand Down
10 changes: 10 additions & 0 deletions jobrunner/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class Job:
completed_at INT,
trace_context TEXT,
status_code_updated_at INT,
level4_excluded_files TEXT,
PRIMARY KEY (id)
);
Expand All @@ -179,6 +180,13 @@ class Job:
""",
)

# Migration 2: persist the set of files excluded from level-4
# (moderately_sensitive) output checks on each job, for audit purposes
# and so local_run can report them later. Stored as TEXT — presumably
# serialized JSON mapping filename -> reason; confirm against the
# Job.level4_excluded_files dict field.
migration(
2,
"""
ALTER TABLE job ADD COLUMN level4_excluded_files TEXT;
""",
)

id: str = None # noqa: A003
job_request_id: str = None
state: State = None
Expand Down Expand Up @@ -237,6 +245,8 @@ class Job:
# used to track the OTel trace context for this job
trace_context: dict = None

level4_excluded_files: dict = None

# used to cache the job_request json by the tracing code
_job_request = None

Expand Down
1 change: 1 addition & 0 deletions jobrunner/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ def save_results(job, job_definition, results):
"""Extract the results of the execution and update the job accordingly."""
# save job outputs
job.outputs = results.outputs
job.level4_excluded_files = results.level4_excluded_files

message = None
error = False
Expand Down
31 changes: 31 additions & 0 deletions tests/cli/test_local_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pytest
from pipeline import load_pipeline
from ruyaml import YAML

from jobrunner import config
from jobrunner.actions import get_action_specification
Expand Down Expand Up @@ -40,6 +41,36 @@ def test_local_run_limits_applied(db, tmp_path, docker_cleanup):
assert metadata["HostConfig"]["NanoCpus"] == 1.5 * 1e9


@pytest.mark.slow_test
@pytest.mark.needs_docker
def test_local_run_level_4_checks_applied_and_logged(
    db, tmp_path, docker_cleanup, capsys
):
    """Run a project whose output fails the level-4 checks and verify the
    excluded files are recorded on the Job row and shown in the console output.
    """
    # Copy the fixture project, then rewrite generate_dataset's output level
    # from highly_sensitive to moderately_sensitive so the level-4 file
    # checks apply to it.
    workdir = tmp_path / "project"
    shutil.copytree(str(FIXTURE_DIR / "full_project"), workdir)
    yaml_path = workdir / "project.yaml"
    parser = YAML()
    config = parser.load(yaml_path)
    action_outputs = config["actions"]["generate_dataset"]["outputs"]
    action_outputs["moderately_sensitive"] = action_outputs.pop("highly_sensitive")
    parser.dump(config, yaml_path)

    local_run.main(
        project_dir=workdir,
        actions=["generate_dataset"],
        debug=True,  # preserves containers for inspection
    )

    # The excluded file and the reason should have been persisted on the job.
    job = next(iter(database.find_all(Job)))
    assert job.level4_excluded_files == {
        "output/dataset.csv": "File has patient_id column"
    }

    # It should also be surfaced in local_run's printed summary.
    captured = capsys.readouterr().out
    assert "invalid moderately_sensitive outputs:" in captured
    assert "output/dataset.csv - File has patient_id column" in captured


@pytest.mark.parametrize("extraction_tool", ["cohortextractor", "databuilder"])
@pytest.mark.slow_test
@pytest.mark.needs_docker
Expand Down

0 comments on commit 7cb29d4

Please sign in to comment.