Skip to content

Commit

Permalink
CBG-3029 use matching regex replacement for go and python code (#6710)
Browse files Browse the repository at this point in the history
  • Loading branch information
torcolvin authored and bbrks committed Mar 28, 2024
1 parent 1dc2294 commit 8ecfa23
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 114 deletions.
39 changes: 36 additions & 3 deletions tools-tests/password_remover_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import json
import unittest

import pytest

import password_remover

class TestStripPasswordsFromUrl(unittest.TestCase):
Expand Down Expand Up @@ -123,7 +125,7 @@ def test_non_parseable_config(self):
}
}
"""
with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords, log_json_parsing_exceptions=False)
with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords)
assert "foobar" not in with_passwords_removed


Expand Down Expand Up @@ -311,5 +313,36 @@ def test_basic(self):
json_text_expected = json.dumps(json_dict_expected, sort_keys=True)
assert json_text_expected == json_text_actual



@pytest.mark.parametrize("input_str, expected", [
    # Already-valid JSON is returned unchanged.
    (b'{"foo": "bar"}', '{"foo": "bar"}'),
    # A backticked value becomes a double-quoted value.
    (b'{\"foo\": `bar`}', '{"foo": "bar"}'),
    # Literal newlines inside backticks become \n escapes.
    (b'{\"foo\": `bar\nbaz\nboo`}', r'{"foo": "bar\nbaz\nboo"}'),
    # Embedded double quotes and tabs are escaped as well.
    (b'{\"foo\": `bar\n\"baz\n\tboo`}', r'{"foo": "bar\n\"baz\n\tboo"}'),
    # Several backticked values in one document are converted independently.
    (b'{\"foo\": `bar\n`, \"baz\": `howdy`}', r'{"foo": "bar\n", "baz": "howdy"}'),
    # CRLF line endings collapse to a single \n escape.
    (
        b'{\"foo\": `bar\r\n`, \"baz\": `\r\nhowdy`}',
        r'{"foo": "bar\n", "baz": "\nhowdy"}',
    ),
    # Backslashes inside backticks are doubled.
    (
        b'{\"foo\": `bar\\baz`, \"something\": `else\\is\\here`}',
        r'{"foo": "bar\\baz", "something": "else\\is\\here"}',
    ),
])
def test_convert_to_valid_json(input_str, expected):
    """Check the backtick-to-JSON conversion for each parametrized case.

    The converted text must equal ``expected``. The same input is then passed
    to ``get_parsed_json``; its result is discarded, so that call only has to
    complete without raising.
    """
    converted = password_remover.convert_to_valid_json(input_str)
    assert converted == expected
    password_remover.get_parsed_json(input_str)
101 changes: 101 additions & 0 deletions tools-tests/tasks_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright 2024-Present Couchbase, Inc.
#
# Use of this software is governed by the Business Source License included
# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified
# in that file, in accordance with the Business Source License, use of this
# software will be governed by the Apache License, Version 2.0, included in
# the file licenses/APL2.txt.

import json
import pathlib

import password_remover
import tasks

import pytest

# Verbosity level handed to tasks.TaskRunner by the tests in this module.
VERBOSE = 2

# Config document fed to the postprocessors (written to a file or served over
# HTTP by the tests below).
INPUT_CONFIG = """\
{
"password": "password",
"server": "http://localhost:4984/db",
"databases": {
"db" : {
"users" : {
"foo" : "bar"
}
}
}
}"""

# Expected postprocessed document. The tests compare it as a substring of the
# produced file, and the postprocessors serialise with json.dumps(indent=4),
# so the layout here must be exactly the 4-space-indented form.
REDACTED_OUTPUT = """\
{
    "password": "******",
    "server": "http://localhost:4984/db",
    "databases": {
        "db": {
            "users": {
                "<ud>foo</ud>": "bar"
            }
        }
    }
}"""


@pytest.mark.parametrize("tag_userdata", [True, False])
def test_add_file_task(tmpdir, tag_userdata):
    """Run a file task over INPUT_CONFIG and check the postprocessed output.

    Password removal is always applied. User-data tagging is applied only when
    ``tag_userdata`` is true; otherwise the expected text is REDACTED_OUTPUT
    with the ``<ud>`` tags stripped from the ``foo`` key.
    """
    filename = "config.json"
    config_file = tmpdir.join(filename)
    config_file.write(INPUT_CONFIG)

    # Build the postprocessor chain and the matching expectation together.
    postprocessors = [password_remover.remove_passwords]
    if tag_userdata:
        postprocessors.append(password_remover.tag_userdata_in_server_config)
        expected = REDACTED_OUTPUT
    else:
        expected = REDACTED_OUTPUT.replace("<ud>foo</ud>", "foo")

    task = tasks.add_file_task(
        config_file.strpath,
        content_postprocessors=postprocessors,
    )
    output_dir = tmpdir.mkdir("output")
    runner = tasks.TaskRunner(
        verbosity=VERBOSE,
        default_name="sg.log",
        tmp_dir=output_dir,
    )
    runner.run(task)
    runner.close_all_files()

    # The result is read back from the runner's tmpdir under the input's name.
    with open(pathlib.Path(runner.tmpdir) / filename) as fh:
        written = fh.read()
    assert expected in written


def test_make_curl_task(tmpdir, httpserver):
    """Run a curl task against a local HTTP server and check the redacted log.

    The server answers ``/`` with INPUT_CONFIG as JSON. The task applies both
    password removal and user-data tagging, so the log file is expected to
    contain REDACTED_OUTPUT.
    """
    log_name = "curltask"
    httpserver.expect_request("/").respond_with_json(json.loads(INPUT_CONFIG))

    postprocessors = [
        password_remover.remove_passwords,
        password_remover.tag_userdata_in_server_config,
    ]
    task = tasks.make_curl_task(
        "curltask",
        httpserver.url_for("/"),
        content_postprocessors=postprocessors,
        log_file=log_name,
    )

    output_dir = tmpdir.mkdir("output")
    runner = tasks.TaskRunner(
        verbosity=VERBOSE,
        default_name="sg.log",
        tmp_dir=output_dir,
    )
    runner.run(task)
    runner.close_all_files()

    # The task's output is read back from the runner's tmpdir under log_name.
    with open(pathlib.Path(runner.tmpdir) / log_name) as fh:
        captured = fh.read()
    assert REDACTED_OUTPUT in captured

    httpserver.check()
161 changes: 50 additions & 111 deletions tools/password_remover.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,39 @@
"""


import enum
import json
import traceback
import re
from urllib.parse import urlparse

from typing import Union

def is_valid_json(invalid_json):
def get_parsed_json(json_text: str) -> dict:
"""
Is the given string valid JSON?
Turns json_text into a valid JSON object by replacing backquotes with double quotes, and returns it as a dictionary.
"""
got_exception = True
try:
json.loads(invalid_json)
got_exception = False
except Exception as e:
pass
valid_json = convert_to_valid_json(json_text)

return got_exception is False
# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
return lower_keys_dict(valid_json)


def tag_userdata_in_server_config(json_text, log_json_parsing_exceptions=True):
def tag_userdata_in_server_config(json_text):
"""
Content postprocessor that tags user data in a config ready for post-process redaction
"""
try:
valid_json = convert_to_valid_json(json_text)

# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
parsed_json = lower_keys_dict(valid_json)
parsed_json = get_parsed_json(json_text)

tag_userdata_in_server_json(parsed_json)
formatted_json_string = json.dumps(parsed_json, indent=4)
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to tag config user data. See logs for details"}'


Expand All @@ -67,7 +61,7 @@ def tag_userdata_in_server_json(config):
tag_userdata_in_db_json(dbs[db])


def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True):
def tag_userdata_in_db_config(json_text):
"""
Content postprocessor that tags user data in a db config ready for post-process redaction
"""
Expand All @@ -83,9 +77,8 @@ def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True):
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to tag db config user data. See logs for details"}'


Expand Down Expand Up @@ -157,26 +150,22 @@ def remove_passwords_from_config(config_fragment):
remove_passwords_from_config(item)


def remove_passwords(json_text, log_json_parsing_exceptions=True):
def remove_passwords(json_text):
"""
Content postprocessor that strips out all of the sensitive passwords
"""
try:
valid_json = convert_to_valid_json(json_text)
parsed_json = get_parsed_json(json_text)

# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
parsed_json = lower_keys_dict(valid_json)
remove_passwords_from_config(parsed_json)

# Append a trailing \n here to ensure there's adequate separation in sync_gateway.log
formatted_json_string = json.dumps(parsed_json, indent=4) + "\n"
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to remove passwords. See logs for details"}'


Expand Down Expand Up @@ -235,91 +224,41 @@ def strip_password_from_url(url_string):
return new_url


def escape_json_value(raw_value):
def replace_backticks_with_double_quotes(match):
"""
Escape all invalid json characters like " to produce a valid json value
Escape all invalid json characters in sync gateway config files that occur between backticks to produce a valid json value. This matches ConvertBackticksToDoubleQuotes in go code.
Before:
Before::
function(doc, oldDoc) { if (doc.type == "reject_me") {
function(doc, oldDoc) { if (doc.type == "reject_me") {
After:
After::
function(doc, oldDoc) { if (doc.type == \"reject_me\") {
function(doc, oldDoc) { if (doc.type == \"reject_me\") {
"""
escaped = raw_value
escaped = escaped.replace('\\', "\\\\") # Escape any backslashes
escaped = escaped.replace('"', '\\"') # Escape double quotes
escaped = escaped.replace("'", "\\'") # Escape single quotes

# TODO: other stuff should be escaped like \n \t and other control characters
# See http://stackoverflow.com/questions/983451/where-can-i-find-a-list-of-escape-characters-required-for-my-json-ajax-return-ty

return escaped


def convert_to_valid_json(invalid_json):

STATE_OUTSIDE_BACKTICK = "STATE_OUTSIDE_BACKTICK"
STATE_INSIDE_BACKTICK = "STATE_INSIDE_BACKTICK"
state = STATE_OUTSIDE_BACKTICK
output = []
sync_function_buffer = []

try:
invalid_json = invalid_json.decode('utf-8')
except (UnicodeDecodeError, AttributeError):
pass

# Strip newlines
invalid_json = invalid_json.replace('\n', '')

# Strip tabs
invalid_json = invalid_json.replace('\t', '')

# read string char by char
for json_char in invalid_json:

# if non-backtick character:
if json_char != '`':

# if in OUTSIDE_BACKTICK state
if state == STATE_OUTSIDE_BACKTICK:
# append char to output
output.append(json_char)

# if in INSIDE_BACKTICK state
elif state == STATE_INSIDE_BACKTICK:
# append to sync_function_buffer
sync_function_buffer.append(json_char)

# if backtick character
elif json_char == '`':

# if in OUTSIDE_BACKTICK state
if state == STATE_OUTSIDE_BACKTICK:
# transition to INSIDE_BACKTICK state
state = STATE_INSIDE_BACKTICK

# if in INSIDE_BACKTICK state
elif state == STATE_INSIDE_BACKTICK:
# run sync_function_buffer through escape_json_value()
sync_function_buffer_str = "".join(sync_function_buffer)
sync_function_buffer_str = escape_json_value(sync_function_buffer_str)

# append to output
output.append('"') # append a double quote
output.append(sync_function_buffer_str)
output.append('"') # append a double quote

# empty the sync_function_buffer
sync_function_buffer = []

# transition to OUTSIDE_BACKTICK state
state = STATE_OUTSIDE_BACKTICK

output_str = "".join(output)
return output_str


replacements = [
("\\", "\\\\"), # replace literal slash with two slashes
("\r\n", "\\n"),
("\r", ""),
("\n", "\\n"),
("\t", "\\t"),
(r'"', r'\"')
]
expr = match.group(0)
for search, replace in replacements:
expr = expr.replace(search, replace)

# Replace the backquotes with double-quotes
expr = '"' + expr[1:]
expr = expr[:-1] + '"'
return expr

def convert_to_valid_json(invalid_json: Union[bytes,str]) -> str:
    """
    Return sync gateway config text with every backtick-delimited span rewritten as a JSON string.

    Bytes input is decoded as UTF-8 first; undecodable bytes are kept as backslash escapes
    instead of raising. Each matched span is handed to replace_backticks_with_double_quotes,
    which produces the replacement text.
    """
    text = invalid_json
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors="backslashreplace")

    # DOTALL lets a backticked value span multiple lines; the character before
    # the closing backtick must not be a backslash.
    backtick_span = re.compile(r"`(.*?)[^\\\\]`", flags=re.MULTILINE|re.DOTALL)
    return backtick_span.sub(replace_backticks_with_double_quotes, text)

0 comments on commit 8ecfa23

Please sign in to comment.