Skip to content

Commit

Permalink
feat(pii): Sanitize URLs in Span description and breadcrumbs (#1876)
Browse files Browse the repository at this point in the history
When recording spans for outgoing HTTP requests, split the target URL into three parts: base URL, query params and fragment. The URL is always stripped of the authority and then set in the span's description. Query params and fragment go into data fields of the span. This is also done when creating breadcrumbs for HTTP requests and in the HTTPX and Boto3 integrations.
  • Loading branch information
antonpirker authored Feb 16, 2023
1 parent 0b489c6 commit ba1286e
Show file tree
Hide file tree
Showing 10 changed files with 331 additions and 17 deletions.
2 changes: 0 additions & 2 deletions sentry_sdk/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@
DEFAULT_QUEUE_SIZE = 100
DEFAULT_MAX_BREADCRUMBS = 100

SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"


class INSTRUMENTER:
SENTRY = "sentry"
Expand Down
8 changes: 7 additions & 1 deletion sentry_sdk/integrations/boto3.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from sentry_sdk._functools import partial
from sentry_sdk._types import MYPY
from sentry_sdk.utils import parse_url

if MYPY:
from typing import Any
Expand Down Expand Up @@ -66,9 +67,14 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs):
op=OP.HTTP_CLIENT,
description=description,
)

parsed_url = parse_url(request.url, sanitize=False)

span.set_tag("aws.service_id", service_id)
span.set_tag("aws.operation_name", operation_name)
span.set_data("aws.request.url", request.url)
span.set_data("aws.request.url", parsed_url.url)
span.set_data("http.query", parsed_url.query)
span.set_data("http.fragment", parsed_url.fragment)

# We do it in order for subsequent http calls/retries be
# attached to this span.
Expand Down
3 changes: 2 additions & 1 deletion sentry_sdk/integrations/django/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import weakref

from sentry_sdk._types import MYPY
from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
from sentry_sdk.consts import OP
from sentry_sdk.hub import Hub, _should_send_default_pii
from sentry_sdk.scope import add_global_event_processor
from sentry_sdk.serializer import add_global_repr_processor
Expand All @@ -16,6 +16,7 @@
AnnotatedValue,
HAS_REAL_CONTEXTVARS,
CONTEXTVARS_ERROR_MESSAGE,
SENSITIVE_DATA_SUBSTITUTE,
logger,
capture_internal_exceptions,
event_from_exception,
Expand Down
24 changes: 19 additions & 5 deletions sentry_sdk/integrations/httpx.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from sentry_sdk import Hub
from sentry_sdk.consts import OP
from sentry_sdk.integrations import Integration, DidNotEnable
from sentry_sdk.utils import logger
from sentry_sdk.utils import logger, parse_url

from sentry_sdk._types import MYPY

Expand Down Expand Up @@ -41,11 +41,17 @@ def send(self, request, **kwargs):
if hub.get_integration(HttpxIntegration) is None:
return real_send(self, request, **kwargs)

parsed_url = parse_url(str(request.url), sanitize=False)

with hub.start_span(
op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
op=OP.HTTP_CLIENT,
description="%s %s" % (request.method, parsed_url.url),
) as span:
span.set_data("method", request.method)
span.set_data("url", str(request.url))
span.set_data("url", parsed_url.url)
span.set_data("http.query", parsed_url.query)
span.set_data("http.fragment", parsed_url.fragment)

for key, value in hub.iter_trace_propagation_headers():
logger.debug(
"[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
Expand All @@ -58,6 +64,7 @@ def send(self, request, **kwargs):
span.set_data("status_code", rv.status_code)
span.set_http_status(rv.status_code)
span.set_data("reason", rv.reason_phrase)

return rv

Client.send = send
Expand All @@ -73,11 +80,17 @@ async def send(self, request, **kwargs):
if hub.get_integration(HttpxIntegration) is None:
return await real_send(self, request, **kwargs)

parsed_url = parse_url(str(request.url), sanitize=False)

with hub.start_span(
op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
op=OP.HTTP_CLIENT,
description="%s %s" % (request.method, parsed_url.url),
) as span:
span.set_data("method", request.method)
span.set_data("url", str(request.url))
span.set_data("url", parsed_url.url)
span.set_data("http.query", parsed_url.query)
span.set_data("http.fragment", parsed_url.fragment)

for key, value in hub.iter_trace_propagation_headers():
logger.debug(
"[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
Expand All @@ -90,6 +103,7 @@ async def send(self, request, **kwargs):
span.set_data("status_code", rv.status_code)
span.set_http_status(rv.status_code)
span.set_data("reason", rv.reason_phrase)

return rv

AsyncClient.send = send
8 changes: 6 additions & 2 deletions sentry_sdk/integrations/huey.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
from sentry_sdk._compat import reraise
from sentry_sdk._types import MYPY
from sentry_sdk import Hub
from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
from sentry_sdk.consts import OP
from sentry_sdk.hub import _should_send_default_pii
from sentry_sdk.integrations import DidNotEnable, Integration
from sentry_sdk.tracing import Transaction, TRANSACTION_SOURCE_TASK
from sentry_sdk.utils import capture_internal_exceptions, event_from_exception
from sentry_sdk.utils import (
capture_internal_exceptions,
event_from_exception,
SENSITIVE_DATA_SUBSTITUTE,
)

if MYPY:
from typing import Any, Callable, Optional, Union, TypeVar
Expand Down
16 changes: 13 additions & 3 deletions sentry_sdk/integrations/stdlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from sentry_sdk.integrations import Integration
from sentry_sdk.scope import add_global_event_processor
from sentry_sdk.tracing_utils import EnvironHeaders
from sentry_sdk.utils import capture_internal_exceptions, logger, safe_repr
from sentry_sdk.utils import (
capture_internal_exceptions,
logger,
safe_repr,
parse_url,
)

from sentry_sdk._types import MYPY

Expand Down Expand Up @@ -79,12 +84,17 @@ def putrequest(self, method, url, *args, **kwargs):
url,
)

parsed_url = parse_url(real_url, sanitize=False)

span = hub.start_span(
op=OP.HTTP_CLIENT, description="%s %s" % (method, real_url)
op=OP.HTTP_CLIENT,
description="%s %s" % (method, parsed_url.url),
)

span.set_data("method", method)
span.set_data("url", real_url)
span.set_data("url", parsed_url.url)
span.set_data("http.query", parsed_url.query)
span.set_data("http.fragment", parsed_url.fragment)

rv = real_putrequest(self, method, url, *args, **kwargs)

Expand Down
97 changes: 94 additions & 3 deletions sentry_sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,25 @@
import sys
import threading
import time
from collections import namedtuple

try:
# Python 3
from urllib.parse import parse_qs
from urllib.parse import unquote
from urllib.parse import urlencode
from urllib.parse import urlsplit
from urllib.parse import urlunsplit

except ImportError:
# Python 2
from cgi import parse_qs # type: ignore
from urllib import unquote # type: ignore
from urllib import urlencode # type: ignore
from urlparse import urlsplit # type: ignore
from urlparse import urlunsplit # type: ignore


from datetime import datetime
from functools import partial

Expand Down Expand Up @@ -43,13 +62,14 @@

epoch = datetime(1970, 1, 1)


# The logger is created here but initialized in the debug support module
logger = logging.getLogger("sentry_sdk.errors")

MAX_STRING_LENGTH = 1024
BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$")

SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"


def json_dumps(data):
# type: (Any) -> bytes
Expand Down Expand Up @@ -374,8 +394,6 @@ def removed_because_over_size_limit(cls):
def substituted_because_contains_sensitive_data(cls):
# type: () -> AnnotatedValue
"""The actual value was removed because it contained sensitive information."""
from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE

return AnnotatedValue(
value=SENSITIVE_DATA_SUBSTITUTE,
metadata={
Expand Down Expand Up @@ -1163,6 +1181,79 @@ def from_base64(base64_string):
return utf8_string


Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"])


def sanitize_url(url, remove_authority=True, remove_query_values=True):
    # type: (str, bool, bool) -> str
    """
    Returns the given URL with its sensitive parts replaced by
    SENSITIVE_DATA_SUBSTITUTE.

    :param url: The URL to sanitize.
    :param remove_authority: If True and the netloc contains credentials
        (``username:password@``), they are replaced by substitutes. Host and
        port are kept.
    :param remove_query_values: If True, the value of every query parameter
        is replaced by the substitute. Parameter names are kept
        (percent-encoded) and repeated names are collapsed into one entry.
        If False, the query string is passed through untouched.
    :returns: The sanitized URL. Scheme, path and fragment are unchanged.
    """
    parsed_url = urlsplit(url)

    netloc = parsed_url.netloc
    if remove_authority:
        # Credentials are everything before the last "@"
        # (netloc can be usr:pwd@example.com).
        netloc_parts = netloc.split("@")
        if len(netloc_parts) > 1:
            netloc = "%s:%s@%s" % (
                SENSITIVE_DATA_SUBSTITUTE,
                SENSITIVE_DATA_SUBSTITUTE,
                netloc_parts[-1],
            )

    query_string = parsed_url.query
    if remove_query_values:
        # Only parse the query when it is actually going to be rewritten.
        query_params = parse_qs(parsed_url.query, keep_blank_values=True)

        # `urlencode([(key, "")])` yields "<encoded key>=". The substitute is
        # appended verbatim so it stays readable, while the parameter name
        # stays percent-encoded. Decoding the name (as unquoting the whole
        # string would do) turns e.g. "%23" or "%26" back into "#" or "&" and
        # changes where the query ends or how it splits into parameters.
        query_string = "&".join(
            urlencode([(key, "")]) + SENSITIVE_DATA_SUBSTITUTE
            for key in query_params
        )

    return urlunsplit(
        Components(
            scheme=parsed_url.scheme,
            netloc=netloc,
            path=parsed_url.path,
            query=query_string,
            fragment=parsed_url.fragment,
        )
    )


ParsedUrl = namedtuple("ParsedUrl", ["url", "query", "fragment"])


def parse_url(url, sanitize=True):
    # type: (str, bool) -> ParsedUrl
    """
    Breaks a URL up into a base URL (scheme, netloc and path), its query
    string and its fragment.

    The URL is first passed through `sanitize_url`. Credentials in the
    authority (username and password) are always filtered; the query
    parameter values are only filtered when `sanitize` is True.

    :param url: The URL to split.
    :param sanitize: Whether to filter the query parameter values.
    :returns: A `ParsedUrl` with the fields `url`, `query` and `fragment`.
    """
    cleaned = sanitize_url(url, remove_authority=True, remove_query_values=sanitize)
    parts = urlsplit(cleaned)

    # Re-assemble without query and fragment; those are reported separately.
    base_url = urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))

    return ParsedUrl(url=base_url, query=parts.query, fragment=parts.fragment)


if PY37:

def nanosecond_time():
Expand Down
2 changes: 2 additions & 0 deletions tests/integrations/httpx/test_httpx.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def before_breadcrumb(crumb, hint):
assert crumb["data"] == {
"url": url,
"method": "GET",
"http.fragment": "",
"http.query": "",
"status_code": 200,
"reason": "OK",
"extra": "foo",
Expand Down
2 changes: 2 additions & 0 deletions tests/integrations/requests/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def test_crumb_capture(sentry_init, capture_events):
assert crumb["data"] == {
"url": "https://httpbin.org/status/418",
"method": "GET",
"http.fragment": "",
"http.query": "",
"status_code": response.status_code,
"reason": response.reason,
}
Loading

0 comments on commit ba1286e

Please sign in to comment.