From a77c3927f6a7a7ef4d9383eb806c3221aaf015dd Mon Sep 17 00:00:00 2001
From: Alice Wong <alice.wong@thedatalab.org>
Date: Thu, 3 Oct 2024 18:07:32 +0100
Subject: [PATCH 1/3] change LATEST_VERSION to 4

---
 pipeline/features.py   | 19 +++++++++++++------
 tests/test_features.py | 19 +++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/pipeline/features.py b/pipeline/features.py
index 5ad4fe5..a007ca6 100644
--- a/pipeline/features.py
+++ b/pipeline/features.py
@@ -3,21 +3,28 @@
 from types import SimpleNamespace
 
 
-# The version of `project.yaml` where each feature was introduced
+# The versions of `project.yaml` where each feature applies to
+# Tuple of (version where the feature was introduced, version after which the feature is deprecated)
 FEATURE_FLAGS_BY_VERSION = {
-    "UNIQUE_OUTPUT_PATH": 2,
-    "EXPECTATIONS_POPULATION": 3,
+    "UNIQUE_OUTPUT_PATH": (2, None),
+    "EXPECTATIONS_POPULATION": (3, 3),
+    "REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR": (4, None),
 }
 
-
-LATEST_VERSION = max(FEATURE_FLAGS_BY_VERSION.values())
+LATEST_VERSION = max([v[0] for v in FEATURE_FLAGS_BY_VERSION.values()])
 
 
 def get_feature_flags_for_version(version: float) -> SimpleNamespace:
+    if version > LATEST_VERSION:
+        raise ValueError(f"The latest version is v{LATEST_VERSION}, but got v{version}")
     feat = SimpleNamespace()
 
     for k, v in FEATURE_FLAGS_BY_VERSION.items():
-        value = v <= version  # is this feature turned on the requested version?
+        # is this feature turned on the requested version?
+        if v[1] is None:
+            value = v[0] <= version
+        else:
+            value = v[0] <= version <= v[1]
 
         setattr(feat, k, value)
 
diff --git a/tests/test_features.py b/tests/test_features.py
index 019399a..5f44dfd 100644
--- a/tests/test_features.py
+++ b/tests/test_features.py
@@ -1,3 +1,5 @@
+import pytest
+
 from pipeline.features import get_feature_flags_for_version
 
 
@@ -6,6 +8,7 @@ def test_get_feature_flags_for_version_with_v1():
 
     assert not flags.UNIQUE_OUTPUT_PATH
     assert not flags.EXPECTATIONS_POPULATION
+    assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR
 
 
 def test_get_feature_flags_for_version_with_v2():
@@ -13,6 +16,7 @@ def test_get_feature_flags_for_version_with_v2():
 
     assert flags.UNIQUE_OUTPUT_PATH
     assert not flags.EXPECTATIONS_POPULATION
+    assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR
 
 
 def test_get_feature_flags_for_version_with_v3():
@@ -20,3 +24,18 @@ def test_get_feature_flags_for_version_with_v3():
 
     assert flags.UNIQUE_OUTPUT_PATH
     assert flags.EXPECTATIONS_POPULATION
+    assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR
+
+
+def test_get_feature_flags_for_version_with_v4():
+    flags = get_feature_flags_for_version(4)
+
+    assert flags.UNIQUE_OUTPUT_PATH
+    assert not flags.EXPECTATIONS_POPULATION
+    assert flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR
+
+
+def test_get_feature_flags_for_version_with_v5():
+    msg = "The latest version is v4, but got v5"
+    with pytest.raises(ValueError, match=msg):
+        get_feature_flags_for_version(5)

From 82e2182003a400ddbeacb83260e574ea3bd8f44a Mon Sep 17 00:00:00 2001
From: Alice Wong <alice.wong@thedatalab.org>
Date: Thu, 3 Oct 2024 18:20:45 +0100
Subject: [PATCH 2/3] Align existing tests with v4

---
 tests/fixtures/valid_yaml/project.yaml | 29 ++++++++------------------
 tests/test_models.py                   | 14 ++++++-------
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/tests/fixtures/valid_yaml/project.yaml b/tests/fixtures/valid_yaml/project.yaml
index 23b7c3c..b609593 100644
--- a/tests/fixtures/valid_yaml/project.yaml
+++ b/tests/fixtures/valid_yaml/project.yaml
@@ -1,40 +1,29 @@
-version: '3.0'
-
-expectations:
-  population_size: 100
+version: '4.0'
 
 actions:
-  generate_cohort:
-    run: cohortextractor:latest generate_cohort
-    outputs:
-      highly_sensitive:
-        cohort: output/input.csv
-
-  generate_cohort_with_dummy_data:
-    # we provide --output-dir here to distinguish the action from the one above
-    run: cohortextractor:latest generate_cohort --output-dir output/extra
-    dummy_data_file: test-data/dummy-data.csv
+  generate_dataset:
+    run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz
     outputs:
       highly_sensitive:
-        cohort: output/extra/input.csv
+        dataset: output/dataset.csv.gz
 
   prepare_data_m:
     run: python:latest python analysis/filter_by_sex.py M output/input.csv male.csv
-    needs: [generate_cohort]
+    needs: [generate_dataset]
     outputs:
       highly_sensitive:
         male_cohort: male*.csv
 
   prepare_data_f:
     run: python:latest python analysis/filter_by_sex.py F output/input.csv female.csv
-    needs: [generate_cohort]
+    needs: [generate_dataset]
     outputs:
       highly_sensitive:
         female_cohort: female*.csv
 
   prepare_data_with_quote_in_filename:
     run: python:latest python analysis/filter_by_sex.py F output/input.csv "qu'ote.csv"
-    needs: [generate_cohort]
+    needs: [generate_dataset]
     outputs:
       highly_sensitive:
         quote_cohort: "qu'ote*.csv"
@@ -52,14 +41,14 @@ actions:
     run: minimal-action:v1.1.0 output/input.csv
     config:
       suffix: .backup
-    needs: [generate_cohort]
+    needs: [generate_dataset]
     outputs:
       highly_sensitive:
         cohort: output/input.backup.csv
 
   test_cancellation:
     run: python:latest python analysis/filter_by_sex.py F output/input.csv somefile.csv
-    needs: [generate_cohort]
+    needs: [generate_dataset]
     outputs:
       highly_sensitive:
         somefile: somefile.csv
diff --git a/tests/test_models.py b/tests/test_models.py
index 55e2465..a73966f 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -7,13 +7,12 @@
 
 def test_success():
     data = {
-        "version": "3",
-        "expectations": {"population_size": 10},
+        "version": "4",
         "actions": {
             "action1": {
                 "run": "test:latest",
                 "outputs": {
-                    "moderately_sensitive": {"cohort": "output.csv"},
+                    "moderately_sensitive": {"dataset": "output.csv"},
                 },
             },
         },
@@ -169,7 +168,7 @@ def test_expectations_before_v3_has_a_default_set():
     assert config.expectations.population_size == 1000
 
 
-def test_expectations_exists():
+def test_expectations_exists_for_v3():
     # our logic for this is custom so ensure it works as expected
     data = {
         "version": 3,
@@ -186,7 +185,7 @@ def test_expectations_exists():
         Pipeline.build(**data)
 
 
-def test_expectations_population_size_exists():
+def test_expectations_population_size_exists_for_v3():
     data = {
         "version": 3,
         "expectations": {},
@@ -203,7 +202,7 @@ def test_expectations_population_size_exists():
         Pipeline.build(**data)
 
 
-def test_expectations_population_size_is_a_number():
+def test_expectations_population_size_is_a_number_for_v3():
     data = {
         "version": 3,
         "expectations": {"population_size": "test"},
@@ -225,8 +224,7 @@ def test_pipeline_all_actions(test_file):
     config = load_pipeline(test_file)
 
     assert config.all_actions == [
-        "generate_cohort",
-        "generate_cohort_with_dummy_data",
+        "generate_dataset",
         "prepare_data_m",
         "prepare_data_f",
         "prepare_data_with_quote_in_filename",

From 2c5cb81865db21d44a7a50905d3f9d6a7009ce75 Mon Sep 17 00:00:00 2001
From: Alice Wong <alice.wong@thedatalab.org>
Date: Fri, 4 Oct 2024 13:05:32 +0100
Subject: [PATCH 3/3] Remove cohort extractor support in v4

---
 pipeline/models.py     | 31 +++++++++++++++++++++----------
 pipeline/validation.py |  7 +++++++
 tests/test_models.py   | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/pipeline/models.py b/pipeline/models.py
index 65fc257..9d38772 100644
--- a/pipeline/models.py
+++ b/pipeline/models.py
@@ -15,6 +15,7 @@
     validate_databuilder_outputs,
     validate_glob_pattern,
     validate_no_kwargs,
+    validate_not_cohort_extractor_action,
     validate_type,
 )
 
@@ -238,7 +239,7 @@ def is_database_action(self) -> bool:
 class Pipeline:
     version: float
     actions: dict[str, Action]
-    expectations: Expectations
+    expectations: Expectations | None
 
     @classmethod
     def build(
@@ -262,6 +263,7 @@ def build(
             raise ValidationError(
                 f"`version` must be a number between 1 and {LATEST_VERSION}"
             )
+        feat = get_feature_flags_for_version(version)
 
         validate_type(actions, dict, "Project `actions` section")
         actions = {
@@ -269,6 +271,10 @@ def build(
             for action_id, action_config in actions.items()
         }
 
+        if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR:
+            for config in actions.values():
+                validate_not_cohort_extractor_action(config)
+
         seen: dict[Command, list[str]] = defaultdict(list)
         for name, config in actions.items():
             run = config.run
@@ -278,7 +284,7 @@ def build(
                 )
             seen[run].append(name)
 
-        if get_feature_flags_for_version(version).UNIQUE_OUTPUT_PATH:
+        if feat.UNIQUE_OUTPUT_PATH:
             # find duplicate paths defined in the outputs section
             seen_files = []
             for config in actions.values():
@@ -298,19 +304,24 @@ def build(
                         f"Action `{a.action_id}` references an unknown action in its `needs` list: {n}"
                     )
 
-        feat = get_feature_flags_for_version(version)
-        if feat.EXPECTATIONS_POPULATION:
+        if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR:
+            if expectations is not None:
+                raise ValidationError(
+                    "Project includes `expectations` section, which is not supported in this version"
+                )
+        elif feat.EXPECTATIONS_POPULATION:
             if expectations is None:
                 raise ValidationError("Project must include `expectations` section")
         else:
             expectations = {"population_size": 1000}
 
-        validate_type(expectations, dict, "Project `expectations` section")
-        if "population_size" not in expectations:
-            raise ValidationError(
-                "Project `expectations` section must include `population_size` section",
-            )
-        expectations = Expectations.build(**expectations)
+        if expectations is not None:
+            validate_type(expectations, dict, "Project `expectations` section")
+            if "population_size" not in expectations:
+                raise ValidationError(
+                    "Project `expectations` section must include `population_size` section",
+                )
+            expectations = Expectations.build(**expectations)
 
         return cls(version, actions, expectations)
 
diff --git a/pipeline/validation.py b/pipeline/validation.py
index d1cb2f2..6b32a67 100644
--- a/pipeline/validation.py
+++ b/pipeline/validation.py
@@ -79,6 +79,13 @@ def validate_glob_pattern(pattern: str, privacy_level: str) -> None:
         raise InvalidPatternError("is an absolute path")
 
 
+def validate_not_cohort_extractor_action(action: Action) -> None:
+    if action.run.parts[0].startswith("cohortextractor"):
+        raise ValidationError(
+            f"Action {action.action_id} uses cohortextractor actions, which are not supported in this version."
+        )
+
+
 def validate_cohortextractor_outputs(action_id: str, action: Action) -> None:
     """
     Check cohortextractor's output config is valid for this command
diff --git a/tests/test_models.py b/tests/test_models.py
index a73966f..1c89dc5 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -129,6 +129,23 @@ def test_action_extraction_command_with_one_outputs():
     assert len(outputs.values()) == 1
 
 
+def test_cohortextractor_actions_not_used_after_v3():
+    data = {
+        "version": "4",
+        "actions": {
+            "generate_cohort": {
+                "run": "cohortextractor:latest generate_cohort",
+                "outputs": {
+                    "highly_sensitive": {"cohort": "output/input.csv"},
+                },
+            },
+        },
+    }
+    msg = "uses cohortextractor actions, which are not supported in this version."
+    with pytest.raises(ValidationError, match=msg):
+        Pipeline.build(**data)
+
+
 def test_command_properties():
     data = {
         "version": 1,
@@ -168,6 +185,22 @@ def test_expectations_before_v3_has_a_default_set():
     assert config.expectations.population_size == 1000
 
 
+def test_expectations_does_not_exist_after_v3():
+    data = {
+        "version": 4,
+        "expectations": {},
+        "actions": {
+            "generate_dataset": {
+                "run": "ehrql:v1 generate-dataset args --output output/dataset.csv.gz",
+                "outputs": {"highly_sensitive": {"dataset": "output/dataset.csv.gz"}},
+            }
+        },
+    }
+    msg = "Project includes `expectations` section"
+    with pytest.raises(ValidationError, match=msg):
+        Pipeline.build(**data)
+
+
 def test_expectations_exists_for_v3():
     # our logic for this is custom so ensure it works as expected
     data = {