diff --git a/ehrql/utils/mssql_log_utils.py b/ehrql/utils/mssql_log_utils.py
index 7e04a17a1..facacbea3 100644
--- a/ehrql/utils/mssql_log_utils.py
+++ b/ehrql/utils/mssql_log_utils.py
@@ -67,14 +67,9 @@ def execute_with_log(connection, query, log, query_id=None):
# Regex to match IO statistics messages
- Table\s'(?P
[^']+)'. \s+
- Scan\scount\s (?P\d+), \s+
- logical\sreads\s (?P\d+), \s+
- physical\sreads\s (?P\d+), \s+
- read-ahead\sreads\s (?P\d+), \s+
- lob\slogical\sreads\s (?P\d+), \s+
- lob\sphysical\sreads\s (?P\d+), \s+
- lob\sread-ahead\sreads\s (?P\d+)
+ Table\s'(?P[^']+)'\. \s+
+ (?P.*)
+ \. $
) .*
""",
@@ -114,6 +109,7 @@ def parse_statistics_messages(messages):
timings[f"{prefix}_elapsed_ms"] += int(match["elapsed_ms"])
elif table := match["table"]:
table = table.decode(errors="ignore")
+ io_stats_line = match["io_stats_line"].decode(errors="ignore")
# Temporary table names are, internally to MSSQL, made globally unique
# by padding with underscores and appending a unique suffix. We need to
# restore the original name so our stats make sense. If you've got an
@@ -121,9 +117,10 @@ def parse_statistics_messages(messages):
# you get.
if table.startswith("#"):
table = table.partition("_____")[0]
- stats = table_io[table]
- for key in stats.keys():
- stats[key] += int(match[key])
+ stats = parse_io_stats(io_stats_line)
+ cumulative_stats = table_io[table]
+ for key in cumulative_stats.keys():
+ cumulative_stats[key] += stats.get(key, 0)
else:
# Given the structure of the regex it shouldn't be possible to get here,
# but if somehow we did I'd rather drop the stats message than blow up
@@ -135,6 +132,22 @@ def parse_statistics_messages(messages):
return timings, table_io
+def parse_io_stats(io_stats):
+ return dict(map(parse_io_stats_item, io_stats.split(",")))
+
+
+def parse_io_stats_item(item):
+ item = item.strip()
+ name, _, value = item.rpartition(" ")
+ # Reformat MSSQL's names to the style we use internally
+ if name == "Scan count":
+ name = "scans"
+ name = name.removesuffix(" reads")
+ name = name.replace("-", "_").replace(" ", "_")
+ value = int(value)
+ return name, value
+
+
def format_table_io(table_io):
results_table = table_io_dict_to_table(table_io)
return format_table(results_table)
diff --git a/tests/lib/databases.py b/tests/lib/databases.py
index ddf6cf192..7c0ac8773 100644
--- a/tests/lib/databases.py
+++ b/tests/lib/databases.py
@@ -177,7 +177,12 @@ def make_mssql_database(containers):
def run_mssql(container_name, containers, password, mssql_port): # pragma: no cover
containers.run_bg(
name=container_name,
- image="mcr.microsoft.com/mssql/server:2017-CU30-ubuntu-18.04",
+ # This is *not* the version that TPP run for us in production which, as of
+ # 2024-09-24, is SQL Server 2016 (13.0.5893.48). That version is not available
+ # as a Docker image, so we run the oldest supported version instead. Both the
+ # production server and our test server set the "compatibility level" to the
+ # same value so the same feature set should be supported.
+ image="mcr.microsoft.com/mssql/server:2019-CU28-ubuntu-20.04",
volumes={
MSSQL_SETUP_DIR: {"bind": "/mssql", "mode": "ro"},
},
diff --git a/tests/support/mssql/entrypoint.sh b/tests/support/mssql/entrypoint.sh
index c8f12c848..bd03442da 100755
--- a/tests/support/mssql/entrypoint.sh
+++ b/tests/support/mssql/entrypoint.sh
@@ -46,12 +46,13 @@ if [ "$1" = '/opt/mssql/bin/sqlservr' ]; then
function _sqlcmd() {
# Extra arguments:
+ # -C : trust server certificate
# -b : exit with 1 on error rather than 0
# -h -1 : no headers in output
# -W : trim trailing whitespace
- /opt/mssql-tools/bin/sqlcmd \
+ /opt/mssql-tools18/bin/sqlcmd \
-S localhost -U sa -P "$MSSQL_SA_PASSWORD" -d master \
- -b -h -1 -W \
+ -C -b -h -1 -W \
"$@"
}
diff --git a/tests/unit/utils/test_mssql_log_utils.py b/tests/unit/utils/test_mssql_log_utils.py
index f3b37aa0f..4e4c6739e 100644
--- a/tests/unit/utils/test_mssql_log_utils.py
+++ b/tests/unit/utils/test_mssql_log_utils.py
@@ -40,10 +40,14 @@ def test_parse_statistics_messages():
b" Scan count 1, logical reads 7, physical reads 0, read-ahead reads 0,"
b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0."
),
+ # Newer version of SQL Server return IO stats in a different order and with
+ # additional items, so check we handle those correctly
(
b"Table '#tmp_1______________________________________________________________________________________________________________0000000014D9'."
- b" Scan count 1, logical reads 4, physical reads 0, read-ahead reads 0,"
- b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0."
+ b" Scan count 1, logical reads 4, physical reads 0, page server reads 0,"
+ b" read-ahead reads 0, page server read-ahead reads 0, lob logical reads 0,"
+ b" lob physical reads 0, lob page server reads 0, lob read-ahead reads 0,"
+ b" lob page server read-ahead reads 0."
),
(
b"Table '#tmp_1______________________________________________________________________________________________________________0000000014B1'."