-
Notifications
You must be signed in to change notification settings - Fork 515
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(pii): Sanitize URLs in Span description and breadcrumbs (#1876)
When recording spans for outgoing HTTP requests, split the target URL into three parts: base URL, query params and fragment. The base URL is always stripped of the authority (username and password) and then set as the span's description. Query params and fragment go into data fields of the span. This is also done when creating breadcrumbs for HTTP requests and in the HTTPX and Boto3 integrations.
- Loading branch information
1 parent
0b489c6
commit ba1286e
Showing
10 changed files
with
331 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,25 @@ | |
import sys | ||
import threading | ||
import time | ||
from collections import namedtuple | ||
|
||
try: | ||
# Python 3 | ||
from urllib.parse import parse_qs | ||
from urllib.parse import unquote | ||
from urllib.parse import urlencode | ||
from urllib.parse import urlsplit | ||
from urllib.parse import urlunsplit | ||
|
||
except ImportError: | ||
# Python 2 | ||
from cgi import parse_qs # type: ignore | ||
from urllib import unquote # type: ignore | ||
from urllib import urlencode # type: ignore | ||
from urlparse import urlsplit # type: ignore | ||
from urlparse import urlunsplit # type: ignore | ||
|
||
|
||
from datetime import datetime | ||
from functools import partial | ||
|
||
|
@@ -43,13 +62,14 @@ | |
|
||
epoch = datetime(1970, 1, 1) | ||
|
||
|
||
# The logger is created here but initialized in the debug support module | ||
logger = logging.getLogger("sentry_sdk.errors") | ||
|
||
MAX_STRING_LENGTH = 1024 | ||
BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$") | ||
|
||
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" | ||
|
||
|
||
def json_dumps(data): | ||
# type: (Any) -> bytes | ||
|
@@ -374,8 +394,6 @@ def removed_because_over_size_limit(cls): | |
def substituted_because_contains_sensitive_data(cls): | ||
# type: () -> AnnotatedValue | ||
"""The actual value was removed because it contained sensitive information.""" | ||
from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE | ||
|
||
return AnnotatedValue( | ||
value=SENSITIVE_DATA_SUBSTITUTE, | ||
metadata={ | ||
|
@@ -1163,6 +1181,79 @@ def from_base64(base64_string): | |
return utf8_string | ||
|
||
|
||
# URL parts in the order of the five-item sequence that urlunsplit() accepts.
Components = namedtuple(
    "Components",
    "scheme netloc path query fragment",
)
|
||
|
||
def sanitize_url(url, remove_authority=True, remove_query_values=True):
    # type: (str, bool, bool) -> str
    """
    Return ``url`` with credentials and query parameter values masked.

    :param url: The URL to sanitize.
    :param remove_authority: If true and the netloc contains an ``@``,
        everything before the last ``@`` is replaced by
        ``SENSITIVE_DATA_SUBSTITUTE:SENSITIVE_DATA_SUBSTITUTE``.
    :param remove_query_values: If true, the query string is rebuilt with one
        entry per distinct parameter name, each value replaced by
        ``SENSITIVE_DATA_SUBSTITUTE``. If false, the original query string is
        kept unchanged.
    :returns: The reassembled URL. Scheme, path and fragment are passed
        through as parsed.
    """
    parts = urlsplit(url)
    param_names = parse_qs(parts.query, keep_blank_values=True)

    # The netloc may carry credentials, e.g. "usr:pwd@www.example.com".
    netloc = parts.netloc
    if remove_authority and "@" in netloc:
        # Keep only what follows the last "@" and mask the rest.
        host = netloc.split("@")[-1]
        netloc = "%s:%s@%s" % (
            SENSITIVE_DATA_SUBSTITUTE,
            SENSITIVE_DATA_SUBSTITUTE,
            host,
        )

    query_string = parts.query
    if remove_query_values:
        masked = {}
        for name in param_names:
            masked[name] = SENSITIVE_DATA_SUBSTITUTE
        # urlencode() percent-escapes the substitute; unquote() makes it
        # readable again.
        # NOTE(review): unquote() also decodes reserved characters (such as
        # "&" or "#") that urlencode() escaped in parameter names -- confirm
        # this is intended.
        query_string = unquote(urlencode(masked))

    sanitized = Components(
        scheme=parts.scheme,
        netloc=netloc,
        path=parts.path,
        query=query_string,
        fragment=parts.fragment,
    )
    return urlunsplit(sanitized)
|
||
|
||
# Result of parse_url(): the base URL plus its query string and fragment.
ParsedUrl = namedtuple(
    "ParsedUrl",
    "url query fragment",
)
|
||
|
||
def parse_url(url, sanitize=True):
    # type: (str, bool) -> ParsedUrl
    """
    Break a URL into its base URL (scheme, netloc and path), query string
    and fragment.

    The URL is first passed through ``sanitize_url`` with
    ``remove_authority=True``, so the authority (username and password) is
    always masked. Query parameter values are masked only when ``sanitize``
    is true.

    :param url: The URL to split.
    :param sanitize: Whether to mask the values of the query parameters.
    :returns: A ``ParsedUrl`` whose ``url`` has neither query nor fragment,
        and whose ``query`` and ``fragment`` come from the sanitized URL.
    """
    sanitized = urlsplit(
        sanitize_url(url, remove_authority=True, remove_query_values=sanitize)
    )

    # Rebuild the URL with the query and fragment blanked out.
    base_parts = Components(
        scheme=sanitized.scheme,
        netloc=sanitized.netloc,
        path=sanitized.path,
        query="",
        fragment="",
    )

    return ParsedUrl(
        url=urlunsplit(base_parts),
        query=sanitized.query,
        fragment=sanitized.fragment,
    )
|
||
|
||
if PY37: | ||
|
||
def nanosecond_time(): | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.