From 640816811055c2e94204cf82b2769c29d0270ebf Mon Sep 17 00:00:00 2001 From: David Evans Date: Mon, 23 Sep 2024 17:32:44 +0100 Subject: [PATCH 1/2] Update MSSQL test Docker image The previous version was incompatible with the updated kernel now used by Github Actions runners. See: https://github.com/microsoft/mssql-docker/issues/868 The fix has been backported as far as the 2019 image, but not to the 2017 image. Seeing that the 2017 image didn't match what was being run in production in any case we might as well upgrade. This requires a few small changes: * The `sqlcmd` tool has moved from `/opt/mssql-tools/bin` to `/opt/mssql-tools18/bin`. This is due to the newer image using the `mssql-tools18` apt package. Note that references to the old path are still present in the codebase because _our_ Docker image still installs the `mssql-tools` package. * The `sqlcmd` invocation now requires the `-C` option to automatically trust the server certificate. --- tests/lib/databases.py | 7 ++++++- tests/support/mssql/entrypoint.sh | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/lib/databases.py b/tests/lib/databases.py index ddf6cf192..7c0ac8773 100644 --- a/tests/lib/databases.py +++ b/tests/lib/databases.py @@ -177,7 +177,12 @@ def make_mssql_database(containers): def run_mssql(container_name, containers, password, mssql_port): # pragma: no cover containers.run_bg( name=container_name, - image="mcr.microsoft.com/mssql/server:2017-CU30-ubuntu-18.04", + # This is *not* the version that TPP run for us in production which, as of + # 2024-09-24, is SQL Server 2016 (13.0.5893.48). That version is not available + # as a Docker image, so we run the oldest supported version instead. Both the + # production server and our test server set the "compatibility level" to the + # same value so the same feature set should be supported. + image="mcr.microsoft.com/mssql/server:2019-CU28-ubuntu-20.04", volumes={ MSSQL_SETUP_DIR: {"bind": "/mssql", "mode": "ro"}, }, diff --git a/tests/support/mssql/entrypoint.sh b/tests/support/mssql/entrypoint.sh index c8f12c848..bd03442da 100755 --- a/tests/support/mssql/entrypoint.sh +++ b/tests/support/mssql/entrypoint.sh @@ -46,12 +46,13 @@ if [ "$1" = '/opt/mssql/bin/sqlservr' ]; then function _sqlcmd() { # Extra arguments: + # -C : trust server certificate # -b : exit with 1 on error rather than 0 # -h -1 : no headers in output # -W : trim trailing whitespace - /opt/mssql-tools/bin/sqlcmd \ + /opt/mssql-tools18/bin/sqlcmd \ -S localhost -U sa -P "$MSSQL_SA_PASSWORD" -d master \ - -b -h -1 -W \ + -C -b -h -1 -W \ "$@" } From 3995a653faecf706042bbe64cc0d6e3561cc1963 Mon Sep 17 00:00:00 2001 From: David Evans Date: Tue, 24 Sep 2024 16:38:25 +0100 Subject: [PATCH 2/2] Update log parsing code to work with MSSQL 2019 The previous code used a single big regex and relied on a precise sequence of IO statistics being returned in a fixed order. In SQL Server 2019 new items were added and the order slightly changed. We rewrite the parsing code to be robust to such changes. --- ehrql/utils/mssql_log_utils.py | 35 ++++++++++++++++-------- tests/unit/utils/test_mssql_log_utils.py | 8 ++++-- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/ehrql/utils/mssql_log_utils.py b/ehrql/utils/mssql_log_utils.py index 7e04a17a1..facacbea3 100644 --- a/ehrql/utils/mssql_log_utils.py +++ b/ehrql/utils/mssql_log_utils.py @@ -67,14 +67,9 @@ def execute_with_log(connection, query, log, query_id=None): # Regex to match IO statistics messages - Table\s'(?P[^']+)'. \s+ - Scan\scount\s (?P\d+), \s+ - logical\sreads\s (?P\d+), \s+ - physical\sreads\s (?P\d+), \s+ - read-ahead\sreads\s (?P\d+), \s+ - lob\slogical\sreads\s (?P\d+), \s+ - lob\sphysical\sreads\s (?P\d+), \s+ - lob\sread-ahead\sreads\s (?P\d+) + Table\s'(?P
[^']+)'\. \s+ + (?P.*) + \. $ ) .* """, @@ -114,6 +109,7 @@ def parse_statistics_messages(messages): timings[f"{prefix}_elapsed_ms"] += int(match["elapsed_ms"]) elif table := match["table"]: table = table.decode(errors="ignore") + io_stats_line = match["io_stats_line"].decode(errors="ignore") # Temporary table names are, internally to MSSQL, made globally unique # by padding with underscores and appending a unique suffix. We need to # restore the original name so our stats make sense. If you've got an @@ -121,9 +117,10 @@ def parse_statistics_messages(messages): # you get. if table.startswith("#"): table = table.partition("_____")[0] - stats = table_io[table] - for key in stats.keys(): - stats[key] += int(match[key]) + stats = parse_io_stats(io_stats_line) + cumulative_stats = table_io[table] + for key in cumulative_stats.keys(): + cumulative_stats[key] += stats.get(key, 0) else: # Given the structure of the regex it shouldn't be possible to get here, # but if somehow we did I'd rather drop the stats message than blow up @@ -135,6 +132,22 @@ def parse_statistics_messages(messages): return timings, table_io +def parse_io_stats(io_stats): + return dict(map(parse_io_stats_item, io_stats.split(","))) + + +def parse_io_stats_item(item): + item = item.strip() + name, _, value = item.rpartition(" ") + # Reformat MSSQL's names to the style we use internally + if name == "Scan count": + name = "scans" + name = name.removesuffix(" reads") + name = name.replace("-", "_").replace(" ", "_") + value = int(value) + return name, value + + def format_table_io(table_io): results_table = table_io_dict_to_table(table_io) return format_table(results_table) diff --git a/tests/unit/utils/test_mssql_log_utils.py b/tests/unit/utils/test_mssql_log_utils.py index f3b37aa0f..4e4c6739e 100644 --- a/tests/unit/utils/test_mssql_log_utils.py +++ b/tests/unit/utils/test_mssql_log_utils.py @@ -40,10 +40,14 @@ def test_parse_statistics_messages(): b" Scan count 1, logical reads 7, physical reads 0, read-ahead reads 0," b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0." ), + # Newer version of SQL Server return IO stats in a different order and with + # additional items, so check we handle those correctly ( b"Table '#tmp_1______________________________________________________________________________________________________________0000000014D9'." - b" Scan count 1, logical reads 4, physical reads 0, read-ahead reads 0," - b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0." + b" Scan count 1, logical reads 4, physical reads 0, page server reads 0," + b" read-ahead reads 0, page server read-ahead reads 0, lob logical reads 0," + b" lob physical reads 0, lob page server reads 0, lob read-ahead reads 0," + b" lob page server read-ahead reads 0." ), ( b"Table '#tmp_1______________________________________________________________________________________________________________0000000014B1'."