diff --git a/ehrql/utils/mssql_log_utils.py b/ehrql/utils/mssql_log_utils.py index 7e04a17a1..facacbea3 100644 --- a/ehrql/utils/mssql_log_utils.py +++ b/ehrql/utils/mssql_log_utils.py @@ -67,14 +67,9 @@ def execute_with_log(connection, query, log, query_id=None): # Regex to match IO statistics messages - Table\s'(?P[^']+)'. \s+ - Scan\scount\s (?P\d+), \s+ - logical\sreads\s (?P\d+), \s+ - physical\sreads\s (?P\d+), \s+ - read-ahead\sreads\s (?P\d+), \s+ - lob\slogical\sreads\s (?P\d+), \s+ - lob\sphysical\sreads\s (?P\d+), \s+ - lob\sread-ahead\sreads\s (?P\d+) + Table\s'(?P
[^']+)'\. \s+ + (?P.*) + \. $ ) .* """, @@ -114,6 +109,7 @@ def parse_statistics_messages(messages): timings[f"{prefix}_elapsed_ms"] += int(match["elapsed_ms"]) elif table := match["table"]: table = table.decode(errors="ignore") + io_stats_line = match["io_stats_line"].decode(errors="ignore") # Temporary table names are, internally to MSSQL, made globally unique # by padding with underscores and appending a unique suffix. We need to # restore the original name so our stats make sense. If you've got an @@ -121,9 +117,10 @@ def parse_statistics_messages(messages): # you get. if table.startswith("#"): table = table.partition("_____")[0] - stats = table_io[table] - for key in stats.keys(): - stats[key] += int(match[key]) + stats = parse_io_stats(io_stats_line) + cumulative_stats = table_io[table] + for key in cumulative_stats.keys(): + cumulative_stats[key] += stats.get(key, 0) else: # Given the structure of the regex it shouldn't be possible to get here, # but if somehow we did I'd rather drop the stats message than blow up @@ -135,6 +132,22 @@ def parse_statistics_messages(messages): return timings, table_io +def parse_io_stats(io_stats): + return dict(map(parse_io_stats_item, io_stats.split(","))) + + +def parse_io_stats_item(item): + item = item.strip() + name, _, value = item.rpartition(" ") + # Reformat MSSQL's names to the style we use internally + if name == "Scan count": + name = "scans" + name = name.removesuffix(" reads") + name = name.replace("-", "_").replace(" ", "_") + value = int(value) + return name, value + + def format_table_io(table_io): results_table = table_io_dict_to_table(table_io) return format_table(results_table) diff --git a/tests/lib/databases.py b/tests/lib/databases.py index ddf6cf192..7c0ac8773 100644 --- a/tests/lib/databases.py +++ b/tests/lib/databases.py @@ -177,7 +177,12 @@ def make_mssql_database(containers): def run_mssql(container_name, containers, password, mssql_port): # pragma: no cover containers.run_bg( name=container_name, - image="mcr.microsoft.com/mssql/server:2017-CU30-ubuntu-18.04", + # This is *not* the version that TPP run for us in production which, as of + # 2024-09-24, is SQL Server 2016 (13.0.5893.48). That version is not available + # as a Docker image, so we run the oldest supported version instead. Both the + # production server and our test server set the "compatibility level" to the + # same value so the same feature set should be supported. + image="mcr.microsoft.com/mssql/server:2019-CU28-ubuntu-20.04", volumes={ MSSQL_SETUP_DIR: {"bind": "/mssql", "mode": "ro"}, }, diff --git a/tests/support/mssql/entrypoint.sh b/tests/support/mssql/entrypoint.sh index c8f12c848..bd03442da 100755 --- a/tests/support/mssql/entrypoint.sh +++ b/tests/support/mssql/entrypoint.sh @@ -46,12 +46,13 @@ if [ "$1" = '/opt/mssql/bin/sqlservr' ]; then function _sqlcmd() { # Extra arguments: + # -C : trust server certificate # -b : exit with 1 on error rather than 0 # -h -1 : no headers in output # -W : trim trailing whitespace - /opt/mssql-tools/bin/sqlcmd \ + /opt/mssql-tools18/bin/sqlcmd \ -S localhost -U sa -P "$MSSQL_SA_PASSWORD" -d master \ - -b -h -1 -W \ + -C -b -h -1 -W \ "$@" } diff --git a/tests/unit/utils/test_mssql_log_utils.py b/tests/unit/utils/test_mssql_log_utils.py index f3b37aa0f..4e4c6739e 100644 --- a/tests/unit/utils/test_mssql_log_utils.py +++ b/tests/unit/utils/test_mssql_log_utils.py @@ -40,10 +40,14 @@ def test_parse_statistics_messages(): b" Scan count 1, logical reads 7, physical reads 0, read-ahead reads 0," b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0." ), + # Newer version of SQL Server return IO stats in a different order and with + # additional items, so check we handle those correctly ( b"Table '#tmp_1______________________________________________________________________________________________________________0000000014D9'." - b" Scan count 1, logical reads 4, physical reads 0, read-ahead reads 0," - b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0." + b" Scan count 1, logical reads 4, physical reads 0, page server reads 0," + b" read-ahead reads 0, page server read-ahead reads 0, lob logical reads 0," + b" lob physical reads 0, lob page server reads 0, lob read-ahead reads 0," + b" lob page server read-ahead reads 0." ), ( b"Table '#tmp_1______________________________________________________________________________________________________________0000000014B1'."