From a77c3927f6a7a7ef4d9383eb806c3221aaf015dd Mon Sep 17 00:00:00 2001 From: Alice Wong <alice.wong@thedatalab.org> Date: Thu, 3 Oct 2024 18:07:32 +0100 Subject: [PATCH 1/3] change LATEST_VERSION to 4 --- pipeline/features.py | 19 +++++++++++++------ tests/test_features.py | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/pipeline/features.py b/pipeline/features.py index 5ad4fe5..a007ca6 100644 --- a/pipeline/features.py +++ b/pipeline/features.py @@ -3,21 +3,28 @@ from types import SimpleNamespace -# The version of `project.yaml` where each feature was introduced +# The versions of `project.yaml` where each feature applies to +# Tuple of (version where the feature was introduced, version after which the feature is deprecated) FEATURE_FLAGS_BY_VERSION = { - "UNIQUE_OUTPUT_PATH": 2, - "EXPECTATIONS_POPULATION": 3, + "UNIQUE_OUTPUT_PATH": (2, None), + "EXPECTATIONS_POPULATION": (3, 3), + "REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR": (4, None), } - -LATEST_VERSION = max(FEATURE_FLAGS_BY_VERSION.values()) +LATEST_VERSION = max([v[0] for v in FEATURE_FLAGS_BY_VERSION.values()]) def get_feature_flags_for_version(version: float) -> SimpleNamespace: + if version > LATEST_VERSION: + raise ValueError(f"The latest version is v{LATEST_VERSION}, but got v{version}") feat = SimpleNamespace() for k, v in FEATURE_FLAGS_BY_VERSION.items(): - value = v <= version # is this feature turned on the requested version? + # is this feature turned on the requested version? + if v[1] is None: + value = v[0] <= version + else: + value = v[0] <= version <= v[1] setattr(feat, k, value) diff --git a/tests/test_features.py b/tests/test_features.py index 019399a..5f44dfd 100644 --- a/tests/test_features.py +++ b/tests/test_features.py @@ -1,3 +1,5 @@ +import pytest + from pipeline.features import get_feature_flags_for_version @@ -6,6 +8,7 @@ def test_get_feature_flags_for_version_with_v1(): assert not flags.UNIQUE_OUTPUT_PATH assert not flags.EXPECTATIONS_POPULATION + assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR def test_get_feature_flags_for_version_with_v2(): @@ -13,6 +16,7 @@ def test_get_feature_flags_for_version_with_v2(): assert flags.UNIQUE_OUTPUT_PATH assert not flags.EXPECTATIONS_POPULATION + assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR def test_get_feature_flags_for_version_with_v3(): @@ -20,3 +24,18 @@ def test_get_feature_flags_for_version_with_v3(): assert flags.UNIQUE_OUTPUT_PATH assert flags.EXPECTATIONS_POPULATION + assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR + + +def test_get_feature_flags_for_version_with_v4(): + flags = get_feature_flags_for_version(4) + + assert flags.UNIQUE_OUTPUT_PATH + assert not flags.EXPECTATIONS_POPULATION + assert flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR + + +def test_get_feature_flags_for_version_with_v5(): + msg = "The latest version is v4, but got v5" + with pytest.raises(ValueError, match=msg): + get_feature_flags_for_version(5) From 82e2182003a400ddbeacb83260e574ea3bd8f44a Mon Sep 17 00:00:00 2001 From: Alice Wong <alice.wong@thedatalab.org> Date: Thu, 3 Oct 2024 18:20:45 +0100 Subject: [PATCH 2/3] Align existing tests with v4 --- tests/fixtures/valid_yaml/project.yaml | 29 ++++++++------------------ tests/test_models.py | 14 ++++++------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/tests/fixtures/valid_yaml/project.yaml b/tests/fixtures/valid_yaml/project.yaml index 23b7c3c..b609593 100644 --- a/tests/fixtures/valid_yaml/project.yaml +++ b/tests/fixtures/valid_yaml/project.yaml @@ -1,40 +1,29 @@ -version: '3.0' - -expectations: - population_size: 100 +version: '4.0' actions: - generate_cohort: - run: cohortextractor:latest generate_cohort - outputs: - highly_sensitive: - cohort: output/input.csv - - generate_cohort_with_dummy_data: - # we provide --output-dir here to distinguish the action from the one above - run: cohortextractor:latest generate_cohort --output-dir output/extra - dummy_data_file: test-data/dummy-data.csv + generate_dataset: + run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz outputs: highly_sensitive: - cohort: output/extra/input.csv + dataset: output/dataset.csv.gz prepare_data_m: run: python:latest python analysis/filter_by_sex.py M output/input.csv male.csv - needs: [generate_cohort] + needs: [generate_dataset] outputs: highly_sensitive: male_cohort: male*.csv prepare_data_f: run: python:latest python analysis/filter_by_sex.py F output/input.csv female.csv - needs: [generate_cohort] + needs: [generate_dataset] outputs: highly_sensitive: female_cohort: female*.csv prepare_data_with_quote_in_filename: run: python:latest python analysis/filter_by_sex.py F output/input.csv "qu'ote.csv" - needs: [generate_cohort] + needs: [generate_dataset] outputs: highly_sensitive: quote_cohort: "qu'ote*.csv" @@ -52,14 +41,14 @@ actions: run: minimal-action:v1.1.0 output/input.csv config: suffix: .backup - needs: [generate_cohort] + needs: [generate_dataset] outputs: highly_sensitive: cohort: output/input.backup.csv test_cancellation: run: python:latest python analysis/filter_by_sex.py F output/input.csv somefile.csv - needs: [generate_cohort] + needs: [generate_dataset] outputs: highly_sensitive: somefile: somefile.csv diff --git a/tests/test_models.py b/tests/test_models.py index 55e2465..a73966f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -7,13 +7,12 @@ def test_success(): data = { - "version": "3", - "expectations": {"population_size": 10}, + "version": "4", "actions": { "action1": { "run": "test:latest", "outputs": { - "moderately_sensitive": {"cohort": "output.csv"}, + "moderately_sensitive": {"dataset": "output.csv"}, }, }, }, @@ -169,7 +168,7 @@ def test_expectations_before_v3_has_a_default_set(): assert config.expectations.population_size == 1000 -def test_expectations_exists(): +def test_expectations_exists_for_v3(): # our logic for this is custom so ensure it works as expected data = { "version": 3, @@ -186,7 +185,7 @@ def test_expectations_exists(): Pipeline.build(**data) -def test_expectations_population_size_exists(): +def test_expectations_population_size_exists_for_v3(): data = { "version": 3, "expectations": {}, @@ -203,7 +202,7 @@ def test_expectations_population_size_exists(): Pipeline.build(**data) -def test_expectations_population_size_is_a_number(): +def test_expectations_population_size_is_a_number_for_v3(): data = { "version": 3, "expectations": {"population_size": "test"}, @@ -225,8 +224,7 @@ def test_pipeline_all_actions(test_file): config = load_pipeline(test_file) assert config.all_actions == [ - "generate_cohort", - "generate_cohort_with_dummy_data", + "generate_dataset", "prepare_data_m", "prepare_data_f", "prepare_data_with_quote_in_filename", From 2c5cb81865db21d44a7a50905d3f9d6a7009ce75 Mon Sep 17 00:00:00 2001 From: Alice Wong <alice.wong@thedatalab.org> Date: Fri, 4 Oct 2024 13:05:32 +0100 Subject: [PATCH 3/3] Remove cohort extractor support in v4 --- pipeline/models.py | 31 +++++++++++++++++++++---------- pipeline/validation.py | 7 +++++++ tests/test_models.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/pipeline/models.py b/pipeline/models.py index 65fc257..9d38772 100644 --- a/pipeline/models.py +++ b/pipeline/models.py @@ -15,6 +15,7 @@ validate_databuilder_outputs, validate_glob_pattern, validate_no_kwargs, + validate_not_cohort_extractor_action, validate_type, ) @@ -238,7 +239,7 @@ def is_database_action(self) -> bool: class Pipeline: version: float actions: dict[str, Action] - expectations: Expectations + expectations: Expectations | None @classmethod def build( @@ -262,6 +263,7 @@ def build( raise ValidationError( f"`version` must be a number between 1 and {LATEST_VERSION}" ) + feat = get_feature_flags_for_version(version) validate_type(actions, dict, "Project `actions` section") actions = { @@ -269,6 +271,10 @@ def build( for action_id, action_config in actions.items() } + if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR: + for config in actions.values(): + validate_not_cohort_extractor_action(config) + seen: dict[Command, list[str]] = defaultdict(list) for name, config in actions.items(): run = config.run @@ -278,7 +284,7 @@ def build( ) seen[run].append(name) - if get_feature_flags_for_version(version).UNIQUE_OUTPUT_PATH: + if feat.UNIQUE_OUTPUT_PATH: # find duplicate paths defined in the outputs section seen_files = [] for config in actions.values(): @@ -298,19 +304,24 @@ def build( f"Action `{a.action_id}` references an unknown action in its `needs` list: {n}" ) - feat = get_feature_flags_for_version(version) - if feat.EXPECTATIONS_POPULATION: + if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR: + if expectations is not None: + raise ValidationError( + "Project includes `expectations` section, which is not supported in this version" + ) + elif feat.EXPECTATIONS_POPULATION: if expectations is None: raise ValidationError("Project must include `expectations` section") else: expectations = {"population_size": 1000} - validate_type(expectations, dict, "Project `expectations` section") - if "population_size" not in expectations: - raise ValidationError( - "Project `expectations` section must include `population_size` section", - ) - expectations = Expectations.build(**expectations) + if expectations is not None: + validate_type(expectations, dict, "Project `expectations` section") + if "population_size" not in expectations: + raise ValidationError( + "Project `expectations` section must include `population_size` section", + ) + expectations = Expectations.build(**expectations) return cls(version, actions, expectations) diff --git a/pipeline/validation.py b/pipeline/validation.py index d1cb2f2..6b32a67 100644 --- a/pipeline/validation.py +++ b/pipeline/validation.py @@ -79,6 +79,13 @@ def validate_glob_pattern(pattern: str, privacy_level: str) -> None: raise InvalidPatternError("is an absolute path") +def validate_not_cohort_extractor_action(action: Action) -> None: + if action.run.parts[0].startswith("cohortextractor"): + raise ValidationError( + f"Action {action.action_id} uses cohortextractor actions, which are not supported in this version." + ) + + def validate_cohortextractor_outputs(action_id: str, action: Action) -> None: """ Check cohortextractor's output config is valid for this command diff --git a/tests/test_models.py b/tests/test_models.py index a73966f..1c89dc5 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -129,6 +129,23 @@ def test_action_extraction_command_with_one_outputs(): assert len(outputs.values()) == 1 +def test_cohortextractor_actions_not_used_after_v3(): + data = { + "version": "4", + "actions": { + "generate_cohort": { + "run": "cohortextractor:latest generate_cohort", + "outputs": { + "highly_sensitive": {"cohort": "output/input.csv"}, + }, + }, + }, + } + msg = "uses cohortextractor actions, which are not supported in this version." + with pytest.raises(ValidationError, match=msg): + Pipeline.build(**data) + + def test_command_properties(): data = { "version": 1, @@ -168,6 +185,22 @@ def test_expectations_before_v3_has_a_default_set(): assert config.expectations.population_size == 1000 +def test_expectations_does_not_exist_after_v3(): + data = { + "version": 4, + "expectations": {}, + "actions": { + "generate_dataset": { + "run": "ehrql:v1 generate-dataset args --output output/dataset.csv.gz", + "outputs": {"highly_sensitive": {"dataset": "output/dataset.csv.gz"}}, + } + }, + } + msg = "Project includes `expectations` section" + with pytest.raises(ValidationError, match=msg): + Pipeline.build(**data) + + def test_expectations_exists_for_v3(): # our logic for this is custom so ensure it works as expected data = {