feat(pii): Sanitize URLs in Span description and breadcrumbs (#1876)

When recording spans for outgoing HTTP requests, strip the target URLs in three parts: base URL, query params and fragment. The URL is always stripped of the authority and then set in the spans description. query params and fragment go into data fields of the span. This is also done when creating breadcrumbs for HTTP requests and in the HTTPX and Boto3 integrations.
getsentry · Feb 16, 2023 · ba1286e · ba1286e
1 parent 0b489c6
commit ba1286e
Show file tree

Hide file tree

Showing 10 changed files with 331 additions and 17 deletions.
diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py
@@ -44,8 +44,6 @@
 DEFAULT_QUEUE_SIZE = 100
 DEFAULT_MAX_BREADCRUMBS = 100
 
-SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"
-
 
 class INSTRUMENTER:
     SENTRY = "sentry"

diff --git a/sentry_sdk/integrations/boto3.py b/sentry_sdk/integrations/boto3.py
@@ -7,6 +7,7 @@
 
 from sentry_sdk._functools import partial
 from sentry_sdk._types import MYPY
+from sentry_sdk.utils import parse_url
 
 if MYPY:
     from typing import Any
@@ -66,9 +67,14 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs):
         op=OP.HTTP_CLIENT,
         description=description,
     )
+
+    parsed_url = parse_url(request.url, sanitize=False)
+
     span.set_tag("aws.service_id", service_id)
     span.set_tag("aws.operation_name", operation_name)
-    span.set_data("aws.request.url", request.url)
+    span.set_data("aws.request.url", parsed_url.url)
+    span.set_data("http.query", parsed_url.query)
+    span.set_data("http.fragment", parsed_url.fragment)
 
     # We do it in order for subsequent http calls/retries be
     # attached to this span.

diff --git a/sentry_sdk/integrations/django/__init__.py b/sentry_sdk/integrations/django/__init__.py
@@ -6,7 +6,7 @@
 import weakref
 
 from sentry_sdk._types import MYPY
-from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
+from sentry_sdk.consts import OP
 from sentry_sdk.hub import Hub, _should_send_default_pii
 from sentry_sdk.scope import add_global_event_processor
 from sentry_sdk.serializer import add_global_repr_processor
@@ -16,6 +16,7 @@
     AnnotatedValue,
     HAS_REAL_CONTEXTVARS,
     CONTEXTVARS_ERROR_MESSAGE,
+    SENSITIVE_DATA_SUBSTITUTE,
     logger,
     capture_internal_exceptions,
     event_from_exception,

diff --git a/sentry_sdk/integrations/httpx.py b/sentry_sdk/integrations/httpx.py
@@ -1,7 +1,7 @@
 from sentry_sdk import Hub
 from sentry_sdk.consts import OP
 from sentry_sdk.integrations import Integration, DidNotEnable
-from sentry_sdk.utils import logger
+from sentry_sdk.utils import logger, parse_url
 
 from sentry_sdk._types import MYPY
 
@@ -41,11 +41,17 @@ def send(self, request, **kwargs):
         if hub.get_integration(HttpxIntegration) is None:
             return real_send(self, request, **kwargs)
 
+        parsed_url = parse_url(str(request.url), sanitize=False)
+
         with hub.start_span(
-            op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
+            op=OP.HTTP_CLIENT,
+            description="%s %s" % (request.method, parsed_url.url),
         ) as span:
             span.set_data("method", request.method)
-            span.set_data("url", str(request.url))
+            span.set_data("url", parsed_url.url)
+            span.set_data("http.query", parsed_url.query)
+            span.set_data("http.fragment", parsed_url.fragment)
+
             for key, value in hub.iter_trace_propagation_headers():
                 logger.debug(
                     "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
@@ -58,6 +64,7 @@ def send(self, request, **kwargs):
             span.set_data("status_code", rv.status_code)
             span.set_http_status(rv.status_code)
             span.set_data("reason", rv.reason_phrase)
+
             return rv
 
     Client.send = send
@@ -73,11 +80,17 @@ async def send(self, request, **kwargs):
         if hub.get_integration(HttpxIntegration) is None:
             return await real_send(self, request, **kwargs)
 
+        parsed_url = parse_url(str(request.url), sanitize=False)
+
         with hub.start_span(
-            op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
+            op=OP.HTTP_CLIENT,
+            description="%s %s" % (request.method, parsed_url.url),
         ) as span:
             span.set_data("method", request.method)
-            span.set_data("url", str(request.url))
+            span.set_data("url", parsed_url.url)
+            span.set_data("http.query", parsed_url.query)
+            span.set_data("http.fragment", parsed_url.fragment)
+
             for key, value in hub.iter_trace_propagation_headers():
                 logger.debug(
                     "[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
@@ -90,6 +103,7 @@ async def send(self, request, **kwargs):
             span.set_data("status_code", rv.status_code)
             span.set_http_status(rv.status_code)
             span.set_data("reason", rv.reason_phrase)
+
             return rv
 
     AsyncClient.send = send
diff --git a/sentry_sdk/integrations/huey.py b/sentry_sdk/integrations/huey.py
@@ -6,11 +6,15 @@
 from sentry_sdk._compat import reraise
 from sentry_sdk._types import MYPY
 from sentry_sdk import Hub
-from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
+from sentry_sdk.consts import OP
 from sentry_sdk.hub import _should_send_default_pii
 from sentry_sdk.integrations import DidNotEnable, Integration
 from sentry_sdk.tracing import Transaction, TRANSACTION_SOURCE_TASK
-from sentry_sdk.utils import capture_internal_exceptions, event_from_exception
+from sentry_sdk.utils import (
+    capture_internal_exceptions,
+    event_from_exception,
+    SENSITIVE_DATA_SUBSTITUTE,
+)
 
 if MYPY:
     from typing import Any, Callable, Optional, Union, TypeVar

diff --git a/sentry_sdk/integrations/stdlib.py b/sentry_sdk/integrations/stdlib.py
@@ -8,7 +8,12 @@
 from sentry_sdk.integrations import Integration
 from sentry_sdk.scope import add_global_event_processor
 from sentry_sdk.tracing_utils import EnvironHeaders
-from sentry_sdk.utils import capture_internal_exceptions, logger, safe_repr
+from sentry_sdk.utils import (
+    capture_internal_exceptions,
+    logger,
+    safe_repr,
+    parse_url,
+)
 
 from sentry_sdk._types import MYPY
 
@@ -79,12 +84,17 @@ def putrequest(self, method, url, *args, **kwargs):
                 url,
             )
 
+        parsed_url = parse_url(real_url, sanitize=False)
+
         span = hub.start_span(
-            op=OP.HTTP_CLIENT, description="%s %s" % (method, real_url)
+            op=OP.HTTP_CLIENT,
+            description="%s %s" % (method, parsed_url.url),
         )
 
         span.set_data("method", method)
-        span.set_data("url", real_url)
+        span.set_data("url", parsed_url.url)
+        span.set_data("http.query", parsed_url.query)
+        span.set_data("http.fragment", parsed_url.fragment)
 
         rv = real_putrequest(self, method, url, *args, **kwargs)
 

diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
@@ -8,6 +8,25 @@
 import sys
 import threading
 import time
+from collections import namedtuple
+
+try:
+    # Python 3
+    from urllib.parse import parse_qs
+    from urllib.parse import unquote
+    from urllib.parse import urlencode
+    from urllib.parse import urlsplit
+    from urllib.parse import urlunsplit
+
+except ImportError:
+    # Python 2
+    from cgi import parse_qs  # type: ignore
+    from urllib import unquote  # type: ignore
+    from urllib import urlencode  # type: ignore
+    from urlparse import urlsplit  # type: ignore
+    from urlparse import urlunsplit  # type: ignore
+
+
 from datetime import datetime
 from functools import partial
 
@@ -43,13 +62,14 @@
 
 epoch = datetime(1970, 1, 1)
 
-
 # The logger is created here but initialized in the debug support module
 logger = logging.getLogger("sentry_sdk.errors")
 
 MAX_STRING_LENGTH = 1024
 BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$")
 
+SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"
+
 
 def json_dumps(data):
     # type: (Any) -> bytes
@@ -374,8 +394,6 @@ def removed_because_over_size_limit(cls):
     def substituted_because_contains_sensitive_data(cls):
         # type: () -> AnnotatedValue
         """The actual value was removed because it contained sensitive information."""
-        from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE
-
         return AnnotatedValue(
             value=SENSITIVE_DATA_SUBSTITUTE,
             metadata={
@@ -1163,6 +1181,79 @@ def from_base64(base64_string):
     return utf8_string
 
 
+Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"])
+
+
+def sanitize_url(url, remove_authority=True, remove_query_values=True):
+    # type: (str, bool, bool) -> str
+    """
+    Removes the authority and query parameter values from a given URL.
+    """
+    parsed_url = urlsplit(url)
+    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
+
+    # strip username:password (netloc can be usr:[email protected])
+    if remove_authority:
+        netloc_parts = parsed_url.netloc.split("@")
+        if len(netloc_parts) > 1:
+            netloc = "%s:%s@%s" % (
+                SENSITIVE_DATA_SUBSTITUTE,
+                SENSITIVE_DATA_SUBSTITUTE,
+                netloc_parts[-1],
+            )
+        else:
+            netloc = parsed_url.netloc
+    else:
+        netloc = parsed_url.netloc
+
+    # strip values from query string
+    if remove_query_values:
+        query_string = unquote(
+            urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params})
+        )
+    else:
+        query_string = parsed_url.query
+
+    safe_url = urlunsplit(
+        Components(
+            scheme=parsed_url.scheme,
+            netloc=netloc,
+            query=query_string,
+            path=parsed_url.path,
+            fragment=parsed_url.fragment,
+        )
+    )
+
+    return safe_url
+
+
+ParsedUrl = namedtuple("ParsedUrl", ["url", "query", "fragment"])
+
+
+def parse_url(url, sanitize=True):
+
+    # type: (str, bool) -> ParsedUrl
+    """
+    Splits a URL into a url (including path), query and fragment. If sanitize is True, the query
+    parameters will be sanitized to remove sensitive data. The autority (username and password)
+    in the URL will always be removed.
+    """
+    url = sanitize_url(url, remove_authority=True, remove_query_values=sanitize)
+
+    parsed_url = urlsplit(url)
+    base_url = urlunsplit(
+        Components(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc,
+            query="",
+            path=parsed_url.path,
+            fragment="",
+        )
+    )
+
+    return ParsedUrl(url=base_url, query=parsed_url.query, fragment=parsed_url.fragment)
+
+
 if PY37:
 
     def nanosecond_time():

diff --git a/tests/integrations/httpx/test_httpx.py b/tests/integrations/httpx/test_httpx.py
@@ -34,6 +34,8 @@ def before_breadcrumb(crumb, hint):
             assert crumb["data"] == {
                 "url": url,
                 "method": "GET",
+                "http.fragment": "",
+                "http.query": "",
                 "status_code": 200,
                 "reason": "OK",
                 "extra": "foo",

diff --git a/tests/integrations/requests/test_requests.py b/tests/integrations/requests/test_requests.py
@@ -20,6 +20,8 @@ def test_crumb_capture(sentry_init, capture_events):
     assert crumb["data"] == {
         "url": "https://httpbin.org/status/418",
         "method": "GET",
+        "http.fragment": "",
+        "http.query": "",
         "status_code": response.status_code,
         "reason": response.reason,
     }