From 83a4b969b7fe5388b143cad5e014ea4424f44a45 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Fri, 18 Aug 2023 14:05:12 +0100 Subject: [PATCH 1/7] Add a T1OO table to the test tpp schema --- tests/tpp_backend_setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tpp_backend_setup.py b/tests/tpp_backend_setup.py index 1ef713ad..ca342aac 100644 --- a/tests/tpp_backend_setup.py +++ b/tests/tpp_backend_setup.py @@ -1055,3 +1055,11 @@ class VmpMapping(Base): id = Column(String(collation="Latin1_General_CI_AS")) prev_id = Column(String(collation="Latin1_General_CI_AS")) + + +class T1OO(Base): + __tablename__ = "T1OO" + # fake pk to satisfy the ORM + # Patient_ID might be the primary key, TBC + pk = Column(Integer, primary_key=True) + Patient_ID = Column(types.BIGINT) From 02725df236158ed99f339386f11623dc787ffaa9 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Fri, 18 Aug 2023 15:27:58 +0100 Subject: [PATCH 2/7] Read the OPENSAFELY_INCLUDE_T1OO flag in StudyDefinition and pass to backends --- cohortextractor/emis_backend.py | 1 + cohortextractor/study_definition.py | 6 ++++++ cohortextractor/tpp_backend.py | 8 ++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cohortextractor/emis_backend.py b/cohortextractor/emis_backend.py index a8b236fe..4716a00c 100644 --- a/cohortextractor/emis_backend.py +++ b/cohortextractor/emis_backend.py @@ -49,6 +49,7 @@ def __init__( covariate_definitions, temporary_database=None, dummy_data=False, # Ignored + include_t1oo=False, # Ignored ): self.database_url = database_url self.covariate_definitions = covariate_definitions diff --git a/cohortextractor/study_definition.py b/cohortextractor/study_definition.py index 85d9d542..403f837b 100644 --- a/cohortextractor/study_definition.py +++ b/cohortextractor/study_definition.py @@ -36,6 +36,11 @@ def __init__( self._original_default_expectations = default_expectations or {} self.set_index_date(index_date) self.pandas_csv_args = self.get_pandas_csv_args(self.covariate_definitions) + + self.include_t1oo = ( + os.environ.get("OPENSAFELY_INCLUDE_T1OO", "").lower().strip() == "true" + ) + self.database_url = os.environ.get("DATABASE_URL") self.temporary_database = os.environ.get("TEMP_DATABASE_NAME") if self.database_url: @@ -200,6 +205,7 @@ def create_backend(self, database_url=None, dummy_data=False): self.covariate_definitions, temporary_database=self.temporary_database, dummy_data=dummy_data, + include_t1oo=self.include_t1oo, ) def recreate_backend(self): diff --git a/cohortextractor/tpp_backend.py b/cohortextractor/tpp_backend.py index 851789b0..b7fbd994 100644 --- a/cohortextractor/tpp_backend.py +++ b/cohortextractor/tpp_backend.py @@ -47,6 +47,7 @@ def __init__( covariate_definitions, temporary_database=None, dummy_data=False, + include_t1oo=False, ): self.database_url = database_url self.covariate_definitions = covariate_definitions @@ -55,8 +56,11 @@ def __init__( self.next_temp_table_id = 1 self._therapeutics_table_name = None self.truncate_sql_logs = False + if self.covariate_definitions: - self.queries = self.get_queries(self.covariate_definitions) + self.queries = self.get_queries( + self.covariate_definitions, include_t1oo=include_t1oo + ) else: self.queries = [] @@ -307,7 +311,7 @@ def get_therapeutic_risk_groups(self): therapeutics_risk_group_variables.add(name) return therapeutics_risk_group_variables - def get_queries(self, covariate_definitions): + def get_queries(self, covariate_definitions, include_t1oo=False): output_columns = {} table_queries = {} for name, (query_type, query_args) in covariate_definitions.items(): From e3ca61625e053da562b37bbb427e9aef17644516 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Fri, 18 Aug 2023 15:29:17 +0100 Subject: [PATCH 3/7] feat: Exclude T1OO patients by default in TPP backend --- cohortextractor/tpp_backend.py | 22 ++++++++++++++- tests/conftest.py | 12 +++++++++ tests/test_tpp_backend.py | 49 ++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/cohortextractor/tpp_backend.py b/cohortextractor/tpp_backend.py index b7fbd994..4abba0e3 100644 --- a/cohortextractor/tpp_backend.py +++ b/cohortextractor/tpp_backend.py @@ -387,7 +387,27 @@ def get_queries(self, covariate_definitions, include_t1oo=False): for name in table_queries if name != "population" ] + + wheres = [f'{output_columns["population"]} = 1'] + + def get_t1oo_exclude_expressions(): + # If this query has been explictly flagged as including T1OO patients then + # return unmodified + if include_t1oo: + return [], [] + # Otherwise we add an extra LEFT OUTER JOIN on the T1OO table and + # WHERE clause which will exclude any patient IDs found in the T1OO table + return ( + [f"LEFT OUTER JOIN T1OO ON T1OO.Patient_ID = {patient_id_expr}"], + ["T1OO.Patient_ID IS null"], + ) + + t100_join, t1oo_where = get_t1oo_exclude_expressions() + joins.extend(t100_join) joins_str = "\n ".join(joins) + wheres.extend(t1oo_where) + where_str = " AND ".join(wheres) + joined_output_query = f""" -- Join all columns for final output SELECT @@ -395,7 +415,7 @@ def get_queries(self, covariate_definitions, include_t1oo=False): FROM {primary_table} {joins_str} - WHERE {output_columns["population"]} = 1 + WHERE {where_str} """ all_queries = [] for sql_list in table_queries.values(): diff --git a/tests/conftest.py b/tests/conftest.py index 31d81033..0b9974c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,8 @@ import os from pathlib import Path +import pytest + def pytest_generate_tests(metafunc): """Source the test environment""" @@ -9,3 +11,13 @@ def pytest_generate_tests(metafunc): k = k.strip() if not os.environ.get(k): os.environ[k] = v.strip() + + +@pytest.fixture() +def include_t1oo(): + # This flag should only be passed by job-runner, if it's + # been set in tests, something is wrong + assert "OPENSAFELY_INCLUDE_T1OO" not in os.environ + os.environ["OPENSAFELY_INCLUDE_T1OO"] = "True" + yield + del os.environ["OPENSAFELY_INCLUDE_T1OO"] diff --git a/tests/test_tpp_backend.py b/tests/test_tpp_backend.py index 482b22cd..0b944bf7 100644 --- a/tests/test_tpp_backend.py +++ b/tests/test_tpp_backend.py @@ -30,6 +30,7 @@ EC, ICNARC, OPA, + T1OO, UKRR, APCS_Der, Appointment, @@ -128,6 +129,7 @@ def setup_function(function): session.query(UKRR).delete() session.query(Patient).delete() session.query(BuildProgress).delete() + session.query(T1OO).delete() session.commit() @@ -181,6 +183,53 @@ def test_minimal_study_with_reserved_keywords(): assert_results(study.to_dicts(), all=["M", "F"], asc=["40", "55"]) +def test_minimal_study_with_t1oo_default(): + # Test that type 1 opt-outs are excluded by default + assert "OPENSAFELY_INCLUDE_T1OO" not in os.environ + session = make_session() + patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M") + patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F") + t1oo_1 = T1OO(Patient_ID=1) + session.add_all([patient_1, patient_2, t1oo_1]) + session.commit() + study = StudyDefinition( + population=patients.all(), + all=patients.sex(), + asc=patients.age_as_of("2020-01-01"), + ) + # patient_1 (M, age 40) excluded + assert_results(study.to_dicts(), all=["F"], asc=["55"]) + + +@pytest.mark.parametrize( + "flag,expected", + [ + ("", ["1", "4"]), + ("False", ["1", "4"]), + ("false", ["1", "4"]), + ("1", ["1", "4"]), + ("True", ["1", "2", "3", "4"]), + ("true", ["1", "2", "3", "4"]), + ], +) +def test_minimal_study_with_t1oo_flag(flag, expected, include_t1oo): + os.environ["OPENSAFELY_INCLUDE_T1OO"] = flag + # Test that type 1 opt-outs are only included if flag is explicitly set to "True" + session = make_session() + patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M") + patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F") + patient_3 = Patient(Patient_ID=3, DateOfBirth="1975-01-01", Sex="F") + patient_4 = Patient(Patient_ID=4, DateOfBirth="1985-01-01", Sex="F") + t1oo_2 = T1OO(Patient_ID=2) + t1oo_3 = T1OO(Patient_ID=3) + session.add_all([patient_1, patient_2, patient_3, patient_4, t1oo_2, t1oo_3]) + session.commit() + study = StudyDefinition( + population=patients.all(), + ) + assert_results(study.to_dicts(), patient_id=expected) + + @pytest.mark.parametrize("format", ["csv", "csv.gz", "feather", "dta", "dta.gz"]) def test_study_to_file_with_therapeutic_risk_groups(tmp_path, format): session = make_session() From 3fdd507a15b6f5c338c4a1342a9a099942948b80 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 23 Aug 2023 12:50:36 +0100 Subject: [PATCH 4/7] Update logging tests --- tests/test_logging.py | 68 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/tests/test_logging.py b/tests/test_logging.py index cc06496a..d8b3a0cf 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -107,10 +107,11 @@ def test_study_definition_initial_stats_logging(logger): ) assert get_stats_logs(logger.entries) == [ # output columns include patient_id, and the 4 variables defined in the - # study defniiton, including event_date_2, which is defined as a parameter to - # event_min_date + # study definition - population, event_date_1, event_min_date and event_date_2, + # which is defined as a parameter to event_min_date # tables - Patient, temp event table for each codelist - {"output_column_count": 5, "table_count": 3, "table_joins_count": 2}, + # joins - Patient to each event table, and t1oo + {"output_column_count": 5, "table_count": 3, "table_joins_count": 3}, # variable_count is a count of the top-level variables defined in the study def (i.e. not event_date_2) {"variable_count": 4}, # 2 variables use a codelist (event_date_1, and the nested event_date_2) @@ -134,10 +135,10 @@ def test_stats_logging_tpp_backend(logger): # initial stats expected_initial_study_def_logs = [ - # output columns include patient_id, and the 2 variables defined in the - # study defniiton - # tables - Patient, temp event table for codelist - {"output_column_count": 3, "table_count": 2, "table_joins_count": 1}, + # output columns include patient_id, and the 2 variables (population, event) + # defined in the study defniiton + # tables - Patient, temp event table, t1oo for codelist + {"output_column_count": 3, "table_count": 2, "table_joins_count": 2}, {"variable_count": 2}, {"variables_using_codelist_count": 1}, {"variable_using_codelist": "event", "codelist_size": 1}, @@ -195,6 +196,55 @@ def test_stats_logging_tpp_backend(logger): ) +def test_stats_logging_tpp_backend_with_t1oo(logger, include_t1oo): + # The query counter is a global at the module level, so it isn't reset between tests + # Find the next position (without incrementing it); this is the start of the test's timing logs + start_counter = timing_log_counter.next + + study = StudyDefinition( + population=patients.all(), + ) + study.to_dicts() + + # initial stats + expected_initial_study_def_logs = [ + # output columns include patient_id and population + # tables - Patient table only + # no joins because t1oo are included, so there is no need to join on the + # t1oo table to exclude them + {"output_column_count": 2, "table_count": 1, "table_joins_count": 0}, + {"variable_count": 1}, + {"variables_using_codelist_count": 0}, + ] + # timing stats + # logs in tpp_backend during query execution + + expected_timing_log_params = [ + *_sql_execute_timing_logs( + description="Query for population", + sql="SELECT * INTO #population", + timing_id=start_counter, + ), + *_sql_execute_timing_logs( + description=None, + sql="CREATE CLUSTERED INDEX patient_id_ix ON #population (patient_id)", + timing_id=start_counter + 1, + is_truncated=False, + ), + *_sql_execute_timing_logs( + description="Join all columns for final output", + sql="#population.patient_id AS [patient_id]", + timing_id=start_counter + 2, + ), + ] + assert_stats_logs( + logger, + expected_initial_study_def_logs, + expected_timing_log_params, + downloaded=False, + ) + + @patch("cohortextractor.cohortextractor.preflight_generation_check") @patch( "cohortextractor.cohortextractor.list_study_definitions", @@ -239,7 +289,7 @@ def test_stats_logging_generate_cohort( expected_initial_study_def_logs = [ # these 3 are logged from StudyDefinition instantiation # patient_id, population, sex - all from patient table, but we make one temp # table per variable - {"output_column_count": 3, "table_count": 2, "table_joins_count": 1}, + {"output_column_count": 3, "table_count": 2, "table_joins_count": 2}, {"variable_count": 2}, # population, sex {"variables_using_codelist_count": 0}, # index_date_count logged from generate_cohort @@ -391,7 +441,7 @@ def test_stats_logging_generate_cohort_with_index_dates( {"index_date_count": 3}, {"min_index_date": "2020-01-01", "max_index_date": "2020-03-01"}, # output_column/table/joins_count is logged in tpp_backend on backend instantiation so it's repeated for each index date - *[{"output_column_count": 3, "table_count": 2, "table_joins_count": 1}] * 4, + *[{"output_column_count": 3, "table_count": 2, "table_joins_count": 2}] * 4, *[ {"resetting_backend_index_date": ix_date} for ix_date in expected_index_dates From 38c0bba93e2614000a352bf61349b58d4d267464 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 13 Sep 2023 12:29:44 +0100 Subject: [PATCH 5/7] Use dsn parameter to indicate t1oo inclusion --- cohortextractor/emis_backend.py | 1 - cohortextractor/study_definition.py | 5 --- cohortextractor/tpp_backend.py | 36 ++++++++++++--- tests/conftest.py | 12 ----- tests/test_logging.py | 12 ++++- tests/test_tpp_backend.py | 70 +++++++++++++++++++++++++++-- 6 files changed, 107 insertions(+), 29 deletions(-) diff --git a/cohortextractor/emis_backend.py b/cohortextractor/emis_backend.py index 4716a00c..a8b236fe 100644 --- a/cohortextractor/emis_backend.py +++ b/cohortextractor/emis_backend.py @@ -49,7 +49,6 @@ def __init__( covariate_definitions, temporary_database=None, dummy_data=False, # Ignored - include_t1oo=False, # Ignored ): self.database_url = database_url self.covariate_definitions = covariate_definitions diff --git a/cohortextractor/study_definition.py b/cohortextractor/study_definition.py index 403f837b..c4bc2a25 100644 --- a/cohortextractor/study_definition.py +++ b/cohortextractor/study_definition.py @@ -37,10 +37,6 @@ def __init__( self.set_index_date(index_date) self.pandas_csv_args = self.get_pandas_csv_args(self.covariate_definitions) - self.include_t1oo = ( - os.environ.get("OPENSAFELY_INCLUDE_T1OO", "").lower().strip() == "true" - ) - self.database_url = os.environ.get("DATABASE_URL") self.temporary_database = os.environ.get("TEMP_DATABASE_NAME") if self.database_url: @@ -205,7 +201,6 @@ def create_backend(self, database_url=None, dummy_data=False): self.covariate_definitions, temporary_database=self.temporary_database, dummy_data=dummy_data, - include_t1oo=self.include_t1oo, ) def recreate_backend(self): diff --git a/cohortextractor/tpp_backend.py b/cohortextractor/tpp_backend.py index 4abba0e3..b896d747 100644 --- a/cohortextractor/tpp_backend.py +++ b/cohortextractor/tpp_backend.py @@ -6,6 +6,7 @@ import re import uuid from functools import cached_property +from urllib import parse import pandas import structlog @@ -40,6 +41,7 @@ class TPPBackend: _db_connection = None _current_column_name = None + include_t1oo = False def __init__( self, @@ -47,9 +49,13 @@ def __init__( covariate_definitions, temporary_database=None, dummy_data=False, - include_t1oo=False, ): - self.database_url = database_url + if database_url is not None: + # set self.include_t1oo from the database url + self.database_url = self.modify_dsn(database_url) + else: + self.database_url = database_url + self.covariate_definitions = covariate_definitions self.temporary_database = temporary_database self.dummy_data = dummy_data @@ -58,12 +64,28 @@ def __init__( self.truncate_sql_logs = False if self.covariate_definitions: - self.queries = self.get_queries( - self.covariate_definitions, include_t1oo=include_t1oo - ) + self.queries = self.get_queries(self.covariate_definitions) else: self.queries = [] + def modify_dsn(self, dsn): + """ + Removes the `opensafely_include_t1oo` parameter if present and uses it to set + the `include_t1oo` attribute accordingly + """ + parts = parse.urlparse(dsn) + params = parse.parse_qs(parts.query, keep_blank_values=True) + include_t1oo_values = params.pop("opensafely_include_t1oo", []) + if len(include_t1oo_values) == 1: + self.include_t1oo = include_t1oo_values[0].lower() == "true" + elif len(include_t1oo_values) != 0: + raise ValueError( + "`opensafely_include_t1oo` parameter must not be supplied more than once" + ) + new_query = parse.urlencode(params, doseq=True) + new_parts = parts._replace(query=new_query) + return parse.urlunparse(new_parts) + def to_file(self, filename): queries = list(self.queries) # If we have a temporary database available we write results to a table @@ -311,7 +333,7 @@ def get_therapeutic_risk_groups(self): therapeutics_risk_group_variables.add(name) return therapeutics_risk_group_variables - def get_queries(self, covariate_definitions, include_t1oo=False): + def get_queries(self, covariate_definitions): output_columns = {} table_queries = {} for name, (query_type, query_args) in covariate_definitions.items(): @@ -393,7 +415,7 @@ def get_queries(self, covariate_definitions, include_t1oo=False): def get_t1oo_exclude_expressions(): # If this query has been explictly flagged as including T1OO patients then # return unmodified - if include_t1oo: + if self.include_t1oo: return [], [] # Otherwise we add an extra LEFT OUTER JOIN on the T1OO table and # WHERE clause which will exclude any patient IDs found in the T1OO table diff --git a/tests/conftest.py b/tests/conftest.py index 0b9974c6..31d81033 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,6 @@ import os from pathlib import Path -import pytest - def pytest_generate_tests(metafunc): """Source the test environment""" @@ -11,13 +9,3 @@ def pytest_generate_tests(metafunc): k = k.strip() if not os.environ.get(k): os.environ[k] = v.strip() - - -@pytest.fixture() -def include_t1oo(): - # This flag should only be passed by job-runner, if it's - # been set in tests, something is wrong - assert "OPENSAFELY_INCLUDE_T1OO" not in os.environ - os.environ["OPENSAFELY_INCLUDE_T1OO"] = "True" - yield - del os.environ["OPENSAFELY_INCLUDE_T1OO"] diff --git a/tests/test_logging.py b/tests/test_logging.py index d8b3a0cf..ac307dc6 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -1,3 +1,4 @@ +import os import re from collections import Counter from unittest.mock import patch @@ -16,6 +17,15 @@ ) +@pytest.fixture +def set_database_url_with_t1oo(monkeypatch): + if "TPP_DATABASE_URL" in os.environ: + monkeypatch.setenv( + "DATABASE_URL", + f'{os.environ["TPP_DATABASE_URL"]}?opensafely_include_t1oo=True', + ) + + @pytest.fixture(name="logger") def fixture_logger(): """Modify `capture_logs` to keep reference to `processors` list intact, @@ -196,7 +206,7 @@ def test_stats_logging_tpp_backend(logger): ) -def test_stats_logging_tpp_backend_with_t1oo(logger, include_t1oo): +def test_stats_logging_tpp_backend_with_t1oo(logger, set_database_url_with_t1oo): # The query counter is a global at the module level, so it isn't reset between tests # Find the next position (without incrementing it); this is the start of the test's timing logs start_counter = timing_log_counter.next diff --git a/tests/test_tpp_backend.py b/tests/test_tpp_backend.py index 0b944bf7..dc8baa3f 100644 --- a/tests/test_tpp_backend.py +++ b/tests/test_tpp_backend.py @@ -80,6 +80,18 @@ def set_database_url(monkeypatch): monkeypatch.setenv("DATABASE_URL", os.environ["TPP_DATABASE_URL"]) +@pytest.fixture +def set_database_url_with_t1oo(monkeypatch): + def set_db_url(t1oo_value): + if "TPP_DATABASE_URL" in os.environ: + monkeypatch.setenv( + "DATABASE_URL", + f'{os.environ["TPP_DATABASE_URL"]}?opensafely_include_t1oo={t1oo_value}', + ) + + return set_db_url + + def setup_module(module): make_database() @@ -134,6 +146,59 @@ def setup_function(function): session.commit() +@pytest.mark.parametrize( + "dsn_in,dsn_out,t1oo_status", + [ + ( + "mssql://user:pass@localhost:4321/db", + "mssql://user:pass@localhost:4321/db", + False, + ), + ( + "mssql://user:pass@localhost:4321/db?param1=one¶m2¶m1=three", + "mssql://user:pass@localhost:4321/db?param1=one¶m1=three¶m2=", + False, + ), + ( + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo¶m2=two", + "mssql://user:pass@localhost:4321/db?param2=two", + False, + ), + ( + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo=false", + "mssql://user:pass@localhost:4321/db", + False, + ), + ( + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo=true", + "mssql://user:pass@localhost:4321/db", + True, + ), + ( + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo=True", + "mssql://user:pass@localhost:4321/db", + True, + ), + ], +) +def test_tpp_backend_modify_dsn(dsn_in, dsn_out, t1oo_status): + backend = TPPBackend(database_url=dsn_in, covariate_definitions=None) + assert backend.database_url == dsn_out + assert backend.include_t1oo == t1oo_status + + +@pytest.mark.parametrize( + "dsn", + [ + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo=false&opensafely_include_t1oo=false", + "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo=false&opensafely_include_t1oo", + ], +) +def test_tpp_backend_modify_dsn_rejects_duplicate_params(dsn): + with pytest.raises(ValueError, match="must not be supplied more than once"): + TPPBackend(database_url=dsn, covariate_definitions=None) + + @pytest.mark.parametrize("format", ["csv", "csv.gz", "feather", "dta", "dta.gz"]) def test_minimal_study_to_file(tmp_path, format): session = make_session() @@ -185,7 +250,6 @@ def test_minimal_study_with_reserved_keywords(): def test_minimal_study_with_t1oo_default(): # Test that type 1 opt-outs are excluded by default - assert "OPENSAFELY_INCLUDE_T1OO" not in os.environ session = make_session() patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M") patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F") @@ -212,8 +276,8 @@ def test_minimal_study_with_t1oo_default(): ("true", ["1", "2", "3", "4"]), ], ) -def test_minimal_study_with_t1oo_flag(flag, expected, include_t1oo): - os.environ["OPENSAFELY_INCLUDE_T1OO"] = flag +def test_minimal_study_with_t1oo_flag(set_database_url_with_t1oo, flag, expected): + set_database_url_with_t1oo(flag) # Test that type 1 opt-outs are only included if flag is explicitly set to "True" session = make_session() patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M") From b5ddcf56802643811cdd587cbb058efb58a9e2a3 Mon Sep 17 00:00:00 2001 From: David Evans Date: Wed, 13 Sep 2023 20:25:28 +0100 Subject: [PATCH 6/7] Temporarily change default This involves reverting some of the test changes. These are all re-reverted back again in a follow-up PR. --- cohortextractor/tpp_backend.py | 3 ++- tests/test_logging.py | 19 +++++++++---------- tests/test_tpp_backend.py | 21 ++------------------- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/cohortextractor/tpp_backend.py b/cohortextractor/tpp_backend.py index b896d747..62c61239 100644 --- a/cohortextractor/tpp_backend.py +++ b/cohortextractor/tpp_backend.py @@ -41,7 +41,8 @@ class TPPBackend: _db_connection = None _current_column_name = None - include_t1oo = False + # TODO: Temporary default to support safe deployment + include_t1oo = True def __init__( self, diff --git a/tests/test_logging.py b/tests/test_logging.py index ac307dc6..f55d6676 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -117,11 +117,10 @@ def test_study_definition_initial_stats_logging(logger): ) assert get_stats_logs(logger.entries) == [ # output columns include patient_id, and the 4 variables defined in the - # study definition - population, event_date_1, event_min_date and event_date_2, - # which is defined as a parameter to event_min_date + # study defniiton, including event_date_2, which is defined as a parameter to + # event_min_date # tables - Patient, temp event table for each codelist - # joins - Patient to each event table, and t1oo - {"output_column_count": 5, "table_count": 3, "table_joins_count": 3}, + {"output_column_count": 5, "table_count": 3, "table_joins_count": 2}, # variable_count is a count of the top-level variables defined in the study def (i.e. not event_date_2) {"variable_count": 4}, # 2 variables use a codelist (event_date_1, and the nested event_date_2) @@ -145,10 +144,10 @@ def test_stats_logging_tpp_backend(logger): # initial stats expected_initial_study_def_logs = [ - # output columns include patient_id, and the 2 variables (population, event) - # defined in the study defniiton - # tables - Patient, temp event table, t1oo for codelist - {"output_column_count": 3, "table_count": 2, "table_joins_count": 2}, + # output columns include patient_id, and the 2 variables defined in the + # study defniiton + # tables - Patient, temp event table for codelist + {"output_column_count": 3, "table_count": 2, "table_joins_count": 1}, {"variable_count": 2}, {"variables_using_codelist_count": 1}, {"variable_using_codelist": "event", "codelist_size": 1}, @@ -299,7 +298,7 @@ def test_stats_logging_generate_cohort( expected_initial_study_def_logs = [ # these 3 are logged from StudyDefinition instantiation # patient_id, population, sex - all from patient table, but we make one temp # table per variable - {"output_column_count": 3, "table_count": 2, "table_joins_count": 2}, + {"output_column_count": 3, "table_count": 2, "table_joins_count": 1}, {"variable_count": 2}, # population, sex {"variables_using_codelist_count": 0}, # index_date_count logged from generate_cohort @@ -451,7 +450,7 @@ def test_stats_logging_generate_cohort_with_index_dates( {"index_date_count": 3}, {"min_index_date": "2020-01-01", "max_index_date": "2020-03-01"}, # output_column/table/joins_count is logged in tpp_backend on backend instantiation so it's repeated for each index date - *[{"output_column_count": 3, "table_count": 2, "table_joins_count": 2}] * 4, + *[{"output_column_count": 3, "table_count": 2, "table_joins_count": 1}] * 4, *[ {"resetting_backend_index_date": ix_date} for ix_date in expected_index_dates diff --git a/tests/test_tpp_backend.py b/tests/test_tpp_backend.py index dc8baa3f..99dd5b51 100644 --- a/tests/test_tpp_backend.py +++ b/tests/test_tpp_backend.py @@ -152,12 +152,12 @@ def setup_function(function): ( "mssql://user:pass@localhost:4321/db", "mssql://user:pass@localhost:4321/db", - False, + True, ), ( "mssql://user:pass@localhost:4321/db?param1=one¶m2¶m1=three", "mssql://user:pass@localhost:4321/db?param1=one¶m1=three¶m2=", - False, + True, ), ( "mssql://user:pass@localhost:4321/db?opensafely_include_t1oo¶m2=two", @@ -248,23 +248,6 @@ def test_minimal_study_with_reserved_keywords(): assert_results(study.to_dicts(), all=["M", "F"], asc=["40", "55"]) -def test_minimal_study_with_t1oo_default(): - # Test that type 1 opt-outs are excluded by default - session = make_session() - patient_1 = Patient(Patient_ID=1, DateOfBirth="1980-01-01", Sex="M") - patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F") - t1oo_1 = T1OO(Patient_ID=1) - session.add_all([patient_1, patient_2, t1oo_1]) - session.commit() - study = StudyDefinition( - population=patients.all(), - all=patients.sex(), - asc=patients.age_as_of("2020-01-01"), - ) - # patient_1 (M, age 40) excluded - assert_results(study.to_dicts(), all=["F"], asc=["55"]) - - @pytest.mark.parametrize( "flag,expected", [ From dad5387447fc6b9763ff38bcf1271d719c3593af Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Thu, 26 Oct 2023 11:54:34 +0100 Subject: [PATCH 7/7] Update t1oo table name --- cohortextractor/tpp_backend.py | 8 ++++++-- tests/test_tpp_backend.py | 8 ++++---- tests/tpp_backend_setup.py | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cohortextractor/tpp_backend.py b/cohortextractor/tpp_backend.py index 62c61239..164e7dd7 100644 --- a/cohortextractor/tpp_backend.py +++ b/cohortextractor/tpp_backend.py @@ -37,6 +37,8 @@ SLEEP = 4 BACKOFF_FACTOR = 4 +T1OO_TABLE = "PatientsWithTypeOneDissent" + class TPPBackend: _db_connection = None @@ -421,8 +423,10 @@ def get_t1oo_exclude_expressions(): # Otherwise we add an extra LEFT OUTER JOIN on the T1OO table and # WHERE clause which will exclude any patient IDs found in the T1OO table return ( - [f"LEFT OUTER JOIN T1OO ON T1OO.Patient_ID = {patient_id_expr}"], - ["T1OO.Patient_ID IS null"], + [ + f"LEFT OUTER JOIN {T1OO_TABLE} ON {T1OO_TABLE}.Patient_ID = {patient_id_expr}" + ], + [f"{T1OO_TABLE}.Patient_ID IS null"], ) t100_join, t1oo_where = get_t1oo_exclude_expressions() diff --git a/tests/test_tpp_backend.py b/tests/test_tpp_backend.py index 99dd5b51..16b7cfd0 100644 --- a/tests/test_tpp_backend.py +++ b/tests/test_tpp_backend.py @@ -30,7 +30,6 @@ EC, ICNARC, OPA, - T1OO, UKRR, APCS_Der, Appointment, @@ -55,6 +54,7 @@ Organisation, Patient, PatientAddress, + PatientsWithTypeOneDissent, PotentialCareHomeAddress, RegistrationHistory, SGSS_AllTests_Negative, @@ -141,7 +141,7 @@ def setup_function(function): session.query(UKRR).delete() session.query(Patient).delete() session.query(BuildProgress).delete() - session.query(T1OO).delete() + session.query(PatientsWithTypeOneDissent).delete() session.commit() @@ -267,8 +267,8 @@ def test_minimal_study_with_t1oo_flag(set_database_url_with_t1oo, flag, expected patient_2 = Patient(Patient_ID=2, DateOfBirth="1965-01-01", Sex="F") patient_3 = Patient(Patient_ID=3, DateOfBirth="1975-01-01", Sex="F") patient_4 = Patient(Patient_ID=4, DateOfBirth="1985-01-01", Sex="F") - t1oo_2 = T1OO(Patient_ID=2) - t1oo_3 = T1OO(Patient_ID=3) + t1oo_2 = PatientsWithTypeOneDissent(Patient_ID=2) + t1oo_3 = PatientsWithTypeOneDissent(Patient_ID=3) session.add_all([patient_1, patient_2, patient_3, patient_4, t1oo_2, t1oo_3]) session.commit() study = StudyDefinition( diff --git a/tests/tpp_backend_setup.py b/tests/tpp_backend_setup.py index ca342aac..b2e3f32e 100644 --- a/tests/tpp_backend_setup.py +++ b/tests/tpp_backend_setup.py @@ -1057,8 +1057,8 @@ class VmpMapping(Base): prev_id = Column(String(collation="Latin1_General_CI_AS")) -class T1OO(Base): - __tablename__ = "T1OO" +class PatientsWithTypeOneDissent(Base): + __tablename__ = "PatientsWithTypeOneDissent" # fake pk to satisfy the ORM # Patient_ID might be the primary key, TBC pk = Column(Integer, primary_key=True)