Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create version 4.0 #341

Merged
merged 3 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions pipeline/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,28 @@
from types import SimpleNamespace


# The version of `project.yaml` where each feature was introduced
# The versions of `project.yaml` where each feature applies to
# Tuple of (version where the feature was introduced, version after which the feature is deprecated)
FEATURE_FLAGS_BY_VERSION = {
"UNIQUE_OUTPUT_PATH": 2,
"EXPECTATIONS_POPULATION": 3,
"UNIQUE_OUTPUT_PATH": (2, None),
"EXPECTATIONS_POPULATION": (3, 3),
"REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR": (4, None),
}


LATEST_VERSION = max(FEATURE_FLAGS_BY_VERSION.values())
LATEST_VERSION = max([v[0] for v in FEATURE_FLAGS_BY_VERSION.values()])


def get_feature_flags_for_version(version: float) -> SimpleNamespace:
if version > LATEST_VERSION:
raise ValueError(f"The latest version is v{LATEST_VERSION}, but got v{version}")
feat = SimpleNamespace()

for k, v in FEATURE_FLAGS_BY_VERSION.items():
value = v <= version # is this feature turned on the requested version?
# is this feature turned on the requested version?
if v[1] is None:
value = v[0] <= version
else:
value = v[0] <= version <= v[1]

setattr(feat, k, value)

Expand Down
31 changes: 21 additions & 10 deletions pipeline/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
validate_databuilder_outputs,
validate_glob_pattern,
validate_no_kwargs,
validate_not_cohort_extractor_action,
validate_type,
)

Expand Down Expand Up @@ -238,7 +239,7 @@ def is_database_action(self) -> bool:
class Pipeline:
version: float
actions: dict[str, Action]
expectations: Expectations
expectations: Expectations | None

@classmethod
def build(
Expand All @@ -262,13 +263,18 @@ def build(
raise ValidationError(
f"`version` must be a number between 1 and {LATEST_VERSION}"
)
feat = get_feature_flags_for_version(version)

validate_type(actions, dict, "Project `actions` section")
actions = {
action_id: Action.build(action_id, **action_config)
for action_id, action_config in actions.items()
}

if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR:
for config in actions.values():
validate_not_cohort_extractor_action(config)

seen: dict[Command, list[str]] = defaultdict(list)
for name, config in actions.items():
run = config.run
Expand All @@ -278,7 +284,7 @@ def build(
)
seen[run].append(name)

if get_feature_flags_for_version(version).UNIQUE_OUTPUT_PATH:
if feat.UNIQUE_OUTPUT_PATH:
# find duplicate paths defined in the outputs section
seen_files = []
for config in actions.values():
Expand All @@ -298,19 +304,24 @@ def build(
f"Action `{a.action_id}` references an unknown action in its `needs` list: {n}"
)

feat = get_feature_flags_for_version(version)
if feat.EXPECTATIONS_POPULATION:
if feat.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR:
if expectations is not None:
raise ValidationError(
"Project includes `expectations` section, which is not supported in this version"
)
elif feat.EXPECTATIONS_POPULATION:
if expectations is None:
raise ValidationError("Project must include `expectations` section")
else:
expectations = {"population_size": 1000}

validate_type(expectations, dict, "Project `expectations` section")
if "population_size" not in expectations:
raise ValidationError(
"Project `expectations` section must include `population_size` section",
)
expectations = Expectations.build(**expectations)
if expectations is not None:
validate_type(expectations, dict, "Project `expectations` section")
if "population_size" not in expectations:
raise ValidationError(
"Project `expectations` section must include `population_size` section",
)
expectations = Expectations.build(**expectations)

return cls(version, actions, expectations)

Expand Down
7 changes: 7 additions & 0 deletions pipeline/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,13 @@ def validate_glob_pattern(pattern: str, privacy_level: str) -> None:
raise InvalidPatternError("is an absolute path")


def validate_not_cohort_extractor_action(action: Action) -> None:
if action.run.parts[0].startswith("cohortextractor"):
raise ValidationError(
f"Action {action.action_id} uses cohortextractor actions, which are not supported in this version."
)


def validate_cohortextractor_outputs(action_id: str, action: Action) -> None:
"""
Check cohortextractor's output config is valid for this command
Expand Down
29 changes: 9 additions & 20 deletions tests/fixtures/valid_yaml/project.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,29 @@
version: '3.0'

expectations:
population_size: 100
version: '4.0'

actions:
generate_cohort:
run: cohortextractor:latest generate_cohort
outputs:
highly_sensitive:
cohort: output/input.csv

generate_cohort_with_dummy_data:
# we provide --output-dir here to distinguish the action from the one above
run: cohortextractor:latest generate_cohort --output-dir output/extra
dummy_data_file: test-data/dummy-data.csv
generate_dataset:
run: ehrql:v1 generate-dataset analysis/dataset_definition.py --output output/dataset.csv.gz
outputs:
highly_sensitive:
cohort: output/extra/input.csv
dataset: output/dataset.csv.gz

prepare_data_m:
run: python:latest python analysis/filter_by_sex.py M output/input.csv male.csv
needs: [generate_cohort]
needs: [generate_dataset]
outputs:
highly_sensitive:
male_cohort: male*.csv

prepare_data_f:
run: python:latest python analysis/filter_by_sex.py F output/input.csv female.csv
needs: [generate_cohort]
needs: [generate_dataset]
outputs:
highly_sensitive:
female_cohort: female*.csv

prepare_data_with_quote_in_filename:
run: python:latest python analysis/filter_by_sex.py F output/input.csv "qu'ote.csv"
needs: [generate_cohort]
needs: [generate_dataset]
outputs:
highly_sensitive:
quote_cohort: "qu'ote*.csv"
Expand All @@ -52,14 +41,14 @@ actions:
run: minimal-action:v1.1.0 output/input.csv
config:
suffix: .backup
needs: [generate_cohort]
needs: [generate_dataset]
outputs:
highly_sensitive:
cohort: output/input.backup.csv

test_cancellation:
run: python:latest python analysis/filter_by_sex.py F output/input.csv somefile.csv
needs: [generate_cohort]
needs: [generate_dataset]
outputs:
highly_sensitive:
somefile: somefile.csv
19 changes: 19 additions & 0 deletions tests/test_features.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

from pipeline.features import get_feature_flags_for_version


Expand All @@ -6,17 +8,34 @@ def test_get_feature_flags_for_version_with_v1():

assert not flags.UNIQUE_OUTPUT_PATH
assert not flags.EXPECTATIONS_POPULATION
assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR


def test_get_feature_flags_for_version_with_v2():
flags = get_feature_flags_for_version(2)

assert flags.UNIQUE_OUTPUT_PATH
assert not flags.EXPECTATIONS_POPULATION
assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR


def test_get_feature_flags_for_version_with_v3():
flags = get_feature_flags_for_version(3)

assert flags.UNIQUE_OUTPUT_PATH
assert flags.EXPECTATIONS_POPULATION
assert not flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR


def test_get_feature_flags_for_version_with_v4():
flags = get_feature_flags_for_version(4)

assert flags.UNIQUE_OUTPUT_PATH
assert not flags.EXPECTATIONS_POPULATION
assert flags.REMOVE_SUPPORT_FOR_COHORT_EXTRACTOR


def test_get_feature_flags_for_version_with_v5():
msg = "The latest version is v4, but got v5"
with pytest.raises(ValueError, match=msg):
get_feature_flags_for_version(5)
47 changes: 39 additions & 8 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

def test_success():
data = {
"version": "3",
"expectations": {"population_size": 10},
"version": "4",
"actions": {
"action1": {
"run": "test:latest",
"outputs": {
"moderately_sensitive": {"cohort": "output.csv"},
"moderately_sensitive": {"dataset": "output.csv"},
},
},
},
Expand Down Expand Up @@ -130,6 +129,23 @@ def test_action_extraction_command_with_one_outputs():
assert len(outputs.values()) == 1


def test_cohortextractor_actions_not_used_after_v3():
data = {
"version": "4",
"actions": {
"generate_cohort": {
"run": "cohortextractor:latest generate_cohort",
"outputs": {
"highly_sensitive": {"cohort": "output/input.csv"},
},
},
},
}
msg = "uses cohortextractor actions, which are not supported in this version."
with pytest.raises(ValidationError, match=msg):
Pipeline.build(**data)


def test_command_properties():
data = {
"version": 1,
Expand Down Expand Up @@ -169,7 +185,23 @@ def test_expectations_before_v3_has_a_default_set():
assert config.expectations.population_size == 1000


def test_expectations_exists():
def test_expectations_does_not_exist_after_v3():
data = {
"version": 4,
"expectations": {},
"actions": {
"generate_dataset": {
"run": "ehrql:v1 generate-dataset args --output output/dataset.csv.gz",
"outputs": {"highly_sensitive": {"dataset": "output/dataset.csv.gz"}},
}
},
}
msg = "Project includes `expectations` section"
with pytest.raises(ValidationError, match=msg):
Pipeline.build(**data)


def test_expectations_exists_for_v3():
# our logic for this is custom so ensure it works as expected
data = {
"version": 3,
Expand All @@ -186,7 +218,7 @@ def test_expectations_exists():
Pipeline.build(**data)


def test_expectations_population_size_exists():
def test_expectations_population_size_exists_for_v3():
data = {
"version": 3,
"expectations": {},
Expand All @@ -203,7 +235,7 @@ def test_expectations_population_size_exists():
Pipeline.build(**data)


def test_expectations_population_size_is_a_number():
def test_expectations_population_size_is_a_number_for_v3():
data = {
"version": 3,
"expectations": {"population_size": "test"},
Expand All @@ -225,8 +257,7 @@ def test_pipeline_all_actions(test_file):
config = load_pipeline(test_file)

assert config.all_actions == [
"generate_cohort",
"generate_cohort_with_dummy_data",
"generate_dataset",
"prepare_data_m",
"prepare_data_f",
"prepare_data_with_quote_in_filename",
Expand Down