import re
import urllib.parse

# Pattern matching strings starting with `"url": "/api/v1/` and ending with `"`
api_v1_pattern = re.compile(r'"url":\s*"/api/v1/.*?"')
# Pattern matching URLs where the final component is a number.
# Defines 4 capture groups to be able to replace the number with a placeholder.
# Only matches the number if it is preceded by a `/` or `=`.
# Does not match patterns containing `<IPv6>` or `<IPv4>` after `/api/v1/`.
api_v1_pattern_with_number = re.compile(
    r'("url":\s*"/api/v1/(?!.*?<(?:IPv6|IPv4)>).*?)([/=])(\d+)(")'
)


def unquote_url(match: re.Match[str]) -> str:
    """Unquote URL encoded text in a /api/v1/ URL.

    Intended as the replacement callback for ``api_v1_pattern.sub``.

    Args:
        match: A match whose full text (group 0) is the ``"url": "/api/v1/..."``
            fragment to decode.

    Returns:
        The matched text with every percent-escape decoded.
    """
    # NOTE(review): unquote() decodes *every* percent-escape, including %22
    # and %5C. If such escapes can occur, the decoded `"` or `\` ends up inside
    # the quoted URL value - confirm the processed URLs never contain them.
    return urllib.parse.unquote(match.group(0))


def replace_url_id(match: re.Match[str]) -> str:
    """Replace the final number (ID) in a URL with a placeholder.

    Intended as the replacement callback for ``api_v1_pattern_with_number.sub``.

    Args:
        match: A match exposing four groups:
            1. the part before the separator (`"url": "/api/...`),
            2. the separator (`/` or `=`),
            3. the number to replace,
            4. the closing double quote.

    Returns:
        The matched text with group 3 swapped for the ``<ID>`` placeholder.
    """
    # Group 3 (the volatile number) is deliberately dropped; everything around
    # it is kept verbatim. The placeholder uses the same angle-bracket token
    # style as the <IPv4>/<IPv6> tokens the pattern above looks for.
    prefix, separator, closing_quote = match.group(1, 2, 4)
    return f"{prefix}{separator}<ID>{closing_quote}"
@@ -56,12 +81,23 @@ def group_objects(json_file_path: str) -> list[dict[str, Any]]: """ with open(json_file_path, "r") as f: s = f.read() + # Replace all URL encoded text in /api/v1/ URLs with unquoted text + # This lets us replace it down the line with our normal IPv{4,6} and MAC placeholders + # Must be done _before_ all other replacements + s = api_v1_pattern.sub(unquote_url, s) + + # Replace all non-deterministic values with placeholders s = timestamp_pattern.sub("