opensafely-core · evansd · Sep 25, 2024 · Sep 23, 2024 · Sep 24, 2024 · rebkwok
diff --git a/ehrql/utils/mssql_log_utils.py b/ehrql/utils/mssql_log_utils.py
@@ -67,14 +67,9 @@ def execute_with_log(connection, query, log, query_id=None):
 
     # Regex to match IO statistics messages
 
-    Table\s'(?P<table>[^']+)'. \s+
-    Scan\scount\s (?P<scans>\d+), \s+
-    logical\sreads\s (?P<logical>\d+), \s+
-    physical\sreads\s (?P<physical>\d+), \s+
-    read-ahead\sreads\s (?P<read_ahead>\d+), \s+
-    lob\slogical\sreads\s (?P<lob_logical>\d+), \s+
-    lob\sphysical\sreads\s (?P<lob_physical>\d+), \s+
-    lob\sread-ahead\sreads\s (?P<lob_read_ahead>\d+)
+    Table\s'(?P<table>[^']+)'\. \s+
+    (?P<io_stats_line>.*)
+    \. $
 
     ) .*
     """,
@@ -114,16 +109,18 @@ def parse_statistics_messages(messages):
                 timings[f"{prefix}_elapsed_ms"] += int(match["elapsed_ms"])
             elif table := match["table"]:
                 table = table.decode(errors="ignore")
+                io_stats_line = match["io_stats_line"].decode(errors="ignore")
                 # Temporary table names are, internally to MSSQL, made globally unique
                 # by padding with underscores and appending a unique suffix. We need to
                 # restore the original name so our stats make sense. If you've got an
                 # actual temp table name with 5 underscores in it you deserve everything
                 # you get.
                 if table.startswith("#"):
                     table = table.partition("_____")[0]
-                stats = table_io[table]
-                for key in stats.keys():
-                    stats[key] += int(match[key])
+                stats = parse_io_stats(io_stats_line)
+                cumulative_stats = table_io[table]
+                for key in cumulative_stats.keys():
+                    cumulative_stats[key] += stats.get(key, 0)
             else:
                 # Given the structure of the regex it shouldn't be possible to get here,
                 # but if somehow we did I'd rather drop the stats message than blow up
@@ -135,6 +132,22 @@ def parse_statistics_messages(messages):
     return timings, table_io
 
 
+def parse_io_stats(io_stats):
+    return dict(map(parse_io_stats_item, io_stats.split(",")))
+
+
+def parse_io_stats_item(item):
+    item = item.strip()
+    name, _, value = item.rpartition(" ")
+    # Reformat MSSQL's names to the style we use internally
+    if name == "Scan count":
+        name = "scans"
+    name = name.removesuffix(" reads")
+    name = name.replace("-", "_").replace(" ", "_")
+    value = int(value)
+    return name, value
+
+
 def format_table_io(table_io):
     results_table = table_io_dict_to_table(table_io)
     return format_table(results_table)

diff --git a/tests/lib/databases.py b/tests/lib/databases.py
@@ -177,7 +177,12 @@ def make_mssql_database(containers):
 def run_mssql(container_name, containers, password, mssql_port):  # pragma: no cover
     containers.run_bg(
         name=container_name,
-        image="mcr.microsoft.com/mssql/server:2017-CU30-ubuntu-18.04",
+        # This is *not* the version that TPP run for us in production which, as of
+        # 2024-09-24, is SQL Server 2016 (13.0.5893.48). That version is not available
+        # as a Docker image, so we run the oldest supported version instead. Both the
+        # production server and our test server set the "compatibility level" to the
+        # same value so the same feature set should be supported.
+        image="mcr.microsoft.com/mssql/server:2019-CU28-ubuntu-20.04",
         volumes={
             MSSQL_SETUP_DIR: {"bind": "/mssql", "mode": "ro"},
         },

diff --git a/tests/support/mssql/entrypoint.sh b/tests/support/mssql/entrypoint.sh
@@ -46,12 +46,13 @@ if [ "$1" = '/opt/mssql/bin/sqlservr' ]; then
 
     function _sqlcmd() {
       # Extra arguments:
+      #    -C : trust server certificate
       #    -b : exit with 1 on error rather than 0
       # -h -1 : no headers in output
       #    -W : trim trailing whitespace
-      /opt/mssql-tools/bin/sqlcmd \
+      /opt/mssql-tools18/bin/sqlcmd \
         -S localhost -U sa -P "$MSSQL_SA_PASSWORD" -d master \
-        -b -h -1 -W \
+        -C -b -h -1 -W \
         "$@"
     }
 

diff --git a/tests/unit/utils/test_mssql_log_utils.py b/tests/unit/utils/test_mssql_log_utils.py
@@ -40,10 +40,14 @@ def test_parse_statistics_messages():
             b" Scan count 1, logical reads 7, physical reads 0, read-ahead reads 0,"
             b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0."
         ),
+        # Newer version of SQL Server return IO stats in a different order and with
+        # additional items, so check we handle those correctly
         (
             b"Table '#tmp_1______________________________________________________________________________________________________________0000000014D9'."
-            b" Scan count 1, logical reads 4, physical reads 0, read-ahead reads 0,"
-            b" lob logical reads 0, lob physical reads 0, lob read-ahead reads 0."
+            b" Scan count 1, logical reads 4, physical reads 0, page server reads 0,"
 table_io = defaultdict( 
     lambda: { 
         "scans": 0, 
         "logical": 0, 
         "physical": 0, 
         "read_ahead": 0, 
         "lob_logical": 0, 
         "lob_physical": 0, 
         "lob_read_ahead": 0, 
     } 
 for key in cumulative_stats.keys(): 
     cumulative_stats[key] += stats.get(key, 0) 
 table_io = defaultdict( 
     lambda: { 
         "scans": 0, 
         "logical": 0, 
         "physical": 0, 
         "read_ahead": 0, 
         "lob_logical": 0, 
         "lob_physical": 0, 
         "lob_read_ahead": 0, 
     } 
 for key in cumulative_stats.keys(): 
     cumulative_stats[key] += stats.get(key, 0) 
+            b" read-ahead reads 0, page server read-ahead reads 0, lob logical reads 0,"
+            b" lob physical reads 0, lob page server reads 0, lob read-ahead reads 0,"
+            b" lob page server read-ahead reads 0."
         ),
         (
             b"Table '#tmp_1______________________________________________________________________________________________________________0000000014B1'."