Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CBG-3029 use matching regex replacement for go and python code #6710

Merged
merged 1 commit into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 36 additions & 3 deletions tools-tests/password_remover_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import json
import unittest

import pytest

import password_remover

class TestStripPasswordsFromUrl(unittest.TestCase):
Expand Down Expand Up @@ -123,7 +125,7 @@ def test_non_parseable_config(self):
}
}
"""
with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords, log_json_parsing_exceptions=False)
with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords)
assert "foobar" not in with_passwords_removed


Expand Down Expand Up @@ -311,5 +313,36 @@ def test_basic(self):
json_text_expected = json.dumps(json_dict_expected, sort_keys=True)
assert json_text_expected == json_text_actual



# (raw config bytes, expected JSON text) pairs for test_convert_to_valid_json.
_CONVERT_TO_VALID_JSON_CASES = [
    # Already-valid JSON without backticks passes through unchanged.
    (
        b'{"foo": "bar"}',
        '{"foo": "bar"}',
    ),
    # A simple backtick-quoted value becomes a double-quoted value.
    (
        b'{\"foo\": `bar`}',
        '{"foo": "bar"}',
    ),
    # Real newlines inside backticks are emitted as the two-character \n escape.
    (
        b'{\"foo\": `bar\nbaz\nboo`}',
        r'{"foo": "bar\nbaz\nboo"}',
    ),
    # Double quotes and tabs inside backticks are escaped as well.
    (
        b'{\"foo\": `bar\n\"baz\n\tboo`}',
        r'{"foo": "bar\n\"baz\n\tboo"}',
    ),
    # Several backtick-quoted values in the same document.
    (
        b'{\"foo\": `bar\n`, \"baz\": `howdy`}',
        r'{"foo": "bar\n", "baz": "howdy"}',
    ),
    # Windows line endings (\r\n) collapse to a single \n escape.
    (
        b'{\"foo\": `bar\r\n`, \"baz\": `\r\nhowdy`}',
        r'{"foo": "bar\n", "baz": "\nhowdy"}',
    ),
    # A literal backslash inside backticks is doubled.
    (
        b'{\"foo\": `bar\\baz`, \"something\": `else\\is\\here`}',
        r'{"foo": "bar\\baz", "something": "else\\is\\here"}',
    ),
]


@pytest.mark.parametrize("input_str, expected", _CONVERT_TO_VALID_JSON_CASES)
def test_convert_to_valid_json(input_str, expected):
    """
    Check that convert_to_valid_json turns each raw config into the expected JSON text.
    """
    converted = password_remover.convert_to_valid_json(input_str)
    assert converted == expected
    # The same input must also go through get_parsed_json without raising.
    password_remover.get_parsed_json(input_str)
101 changes: 101 additions & 0 deletions tools-tests/tasks_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright 2024-Present Couchbase, Inc.
#
# Use of this software is governed by the Business Source License included
# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified
# in that file, in accordance with the Business Source License, use of this
# software will be governed by the Apache License, Version 2.0, included in
# the file licenses/APL2.txt.

import json
import pathlib

import password_remover
import tasks

import pytest

# Verbosity level passed to tasks.TaskRunner by the tests in this module.
VERBOSE = 2

# Sample config used as test input: it carries a top-level "password" field
# and a "users" map under databases/db.
INPUT_CONFIG = """\
{
"password": "password",
"server": "http://localhost:4984/db",
"databases": {
"db" : {
"users" : {
"foo" : "bar"
}
}
}
}"""

# Text the tests expect to find in the collected output when both the
# remove_passwords and tag_userdata_in_server_config postprocessors are used:
# the password value is masked and the user name is wrapped in <ud> tags.
REDACTED_OUTPUT = """\
{
"password": "******",
"server": "http://localhost:4984/db",
"databases": {
"db": {
"users": {
"<ud>foo</ud>": "bar"
}
}
}
}"""


@pytest.mark.parametrize("tag_userdata", [True, False])
def test_add_file_task(tmpdir, tag_userdata):
    """
    Run a file-collection task over INPUT_CONFIG and check the redacted result.

    With tag_userdata the user-data tagging postprocessor is added and the
    <ud> tags are expected in the output; without it only the password is
    expected to be masked.
    """
    filename = "config.json"

    # Passwords are always removed; user-data tagging is optional.
    postprocessors = [password_remover.remove_passwords]
    expected = REDACTED_OUTPUT
    if tag_userdata:
        postprocessors.append(password_remover.tag_userdata_in_server_config)
    else:
        expected = expected.replace("<ud>foo</ud>", "foo")

    config_file = tmpdir.join(filename)
    config_file.write(INPUT_CONFIG)

    task = tasks.add_file_task(
        config_file.strpath,
        content_postprocessors=postprocessors,
    )
    runner = tasks.TaskRunner(
        verbosity=VERBOSE,
        default_name="sg.log",
        tmp_dir=tmpdir.mkdir("output"),
    )
    runner.run(task)
    runner.close_all_files()

    # The runner's output is read back from a file named after the input
    # file inside runner.tmpdir.
    collected_path = pathlib.Path(runner.tmpdir) / filename
    with open(collected_path) as collected:
        assert expected in collected.read()


def test_make_curl_task(tmpdir, httpserver):
    """
    Run a curl task against a local HTTP server that serves INPUT_CONFIG as
    JSON, and check that the logged response is redacted and tagged.
    """
    log_name = "curltask"

    # Serve the sample config from the root of the test HTTP server.
    httpserver.expect_request("/").respond_with_json(json.loads(INPUT_CONFIG))

    url = httpserver.url_for("/")
    postprocessors = [
        password_remover.remove_passwords,
        password_remover.tag_userdata_in_server_config,
    ]
    task = tasks.make_curl_task(
        "curltask",
        url,
        content_postprocessors=postprocessors,
        log_file=log_name,
    )

    runner = tasks.TaskRunner(
        verbosity=VERBOSE,
        default_name="sg.log",
        tmp_dir=tmpdir.mkdir("output"),
    )
    runner.run(task)
    runner.close_all_files()

    # The task's output is read back from the file named by log_file inside
    # runner.tmpdir.
    logged_path = pathlib.Path(runner.tmpdir) / log_name
    with open(logged_path) as logged:
        assert REDACTED_OUTPUT in logged.read()

    httpserver.check()
161 changes: 50 additions & 111 deletions tools/password_remover.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,39 @@

"""


import enum
import json
import traceback
import re
from urllib.parse import urlparse

from typing import Union

def is_valid_json(invalid_json):
def get_parsed_json(json_text: str) -> dict:
"""
Is the given string valid JSON?
Turns json_text into a valid JSON object by replacing backquotes with double quotes, and returns it as a dictionary.
"""
got_exception = True
try:
json.loads(invalid_json)
got_exception = False
except Exception as e:
pass
valid_json = convert_to_valid_json(json_text)

return got_exception is False
# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
return lower_keys_dict(valid_json)


def tag_userdata_in_server_config(json_text, log_json_parsing_exceptions=True):
def tag_userdata_in_server_config(json_text):
"""
Content postprocessor that tags user data in a config ready for post-process redaction
"""
try:
valid_json = convert_to_valid_json(json_text)

# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
parsed_json = lower_keys_dict(valid_json)
parsed_json = get_parsed_json(json_text)

tag_userdata_in_server_json(parsed_json)
formatted_json_string = json.dumps(parsed_json, indent=4)
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to tag config user data. See logs for details"}'


Expand All @@ -67,7 +61,7 @@ def tag_userdata_in_server_json(config):
tag_userdata_in_db_json(dbs[db])


def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True):
def tag_userdata_in_db_config(json_text):
"""
Content postprocessor that tags user data in a db config ready for post-process redaction
"""
Expand All @@ -83,9 +77,8 @@ def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True):
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to tag db config user data. See logs for details"}'


Expand Down Expand Up @@ -157,26 +150,22 @@ def remove_passwords_from_config(config_fragment):
remove_passwords_from_config(item)


def remove_passwords(json_text, log_json_parsing_exceptions=True):
def remove_passwords(json_text):
"""
Content postprocessor that strips out all of the sensitive passwords
"""
try:
valid_json = convert_to_valid_json(json_text)
parsed_json = get_parsed_json(json_text)

# Lower case keys so that "databases" works as a
# key even if the JSON has "Databases" as a key.
parsed_json = lower_keys_dict(valid_json)
remove_passwords_from_config(parsed_json)

# Append a trailing \n here to ensure there's adequate separation in sync_gateway.log
formatted_json_string = json.dumps(parsed_json, indent=4) + "\n"
return formatted_json_string

except Exception as e:
if log_json_parsing_exceptions:
print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e))
traceback.print_exc()
return '{"Error":"Error in sgcollect_info password_remover.py trying to remove passwords. See logs for details"}'


Expand Down Expand Up @@ -235,91 +224,41 @@ def strip_password_from_url(url_string):
return new_url


def escape_json_value(raw_value):
def replace_backticks_with_double_quotes(match):
"""
Escape all invalid json characters like " to produce a valid json value
Escape all invalid json characters in sync gateway config files that occur between backticks to produce a valid json value. This matches ConvertBackticksToDoubleQuotes in go code.

Before:
Before::

function(doc, oldDoc) { if (doc.type == "reject_me") {
function(doc, oldDoc) { if (doc.type == "reject_me") {

After:
After::

function(doc, oldDoc) { if (doc.type == \"reject_me\") {
function(doc, oldDoc) { if (doc.type == \"reject_me\") {

"""
escaped = raw_value
escaped = escaped.replace('\\', "\\\\") # Escape any backslashes
escaped = escaped.replace('"', '\\"') # Escape double quotes
escaped = escaped.replace("'", "\\'") # Escape single quotes

# TODO: other stuff should be escaped like \n \t and other control characters
# See http://stackoverflow.com/questions/983451/where-can-i-find-a-list-of-escape-characters-required-for-my-json-ajax-return-ty

return escaped


def convert_to_valid_json(invalid_json):

STATE_OUTSIDE_BACKTICK = "STATE_OUTSIDE_BACKTICK"
STATE_INSIDE_BACKTICK = "STATE_INSIDE_BACKTICK"
state = STATE_OUTSIDE_BACKTICK
output = []
sync_function_buffer = []

try:
invalid_json = invalid_json.decode('utf-8')
except (UnicodeDecodeError, AttributeError):
pass

# Strip newlines
invalid_json = invalid_json.replace('\n', '')

# Strip tabs
invalid_json = invalid_json.replace('\t', '')

# read string char by char
for json_char in invalid_json:

# if non-backtick character:
if json_char != '`':

# if in OUTSIDE_BACKTICK state
if state == STATE_OUTSIDE_BACKTICK:
# append char to output
output.append(json_char)

# if in INSIDE_BACKTICK state
elif state == STATE_INSIDE_BACKTICK:
# append to sync_function_buffer
sync_function_buffer.append(json_char)

# if backtick character
elif json_char == '`':

# if in OUTSIDE_BACKTICK state
if state == STATE_OUTSIDE_BACKTICK:
# transition to INSIDE_BACKTICK state
state = STATE_INSIDE_BACKTICK

# if in INSIDE_BACKTICK state
elif state == STATE_INSIDE_BACKTICK:
# run sync_function_buffer through escape_json_value()
sync_function_buffer_str = "".join(sync_function_buffer)
sync_function_buffer_str = escape_json_value(sync_function_buffer_str)

# append to output
output.append('"') # append a double quote
output.append(sync_function_buffer_str)
output.append('"') # append a double quote

# empty the sync_function_buffer
sync_function_buffer = []

# transition to OUTSIDE_BACKTICK state
state = STATE_OUTSIDE_BACKTICK

output_str = "".join(output)
return output_str


replacements = [
("\\", "\\\\"), # replace literal slash with two slashes
("\r\n", "\\n"),
("\r", ""),
("\n", "\\n"),
("\t", "\\t"),
(r'"', r'\"')
]
expr = match.group(0)
for search, replace in replacements:
expr = expr.replace(search, replace)

# Replace the backquotes with double-quotes
expr = '"' + expr[1:]
expr = expr[:-1] + '"'
return expr

def convert_to_valid_json(invalid_json: Union[bytes,str]) -> str:
    """
    Convert sync gateway config text into valid JSON text.

    Every backtick-delimited span is passed to
    replace_backticks_with_double_quotes, which returns the double-quoted,
    escaped replacement. Text outside backticks is left untouched.

    :param invalid_json: config text; bytes are decoded as UTF-8, with
        undecodable bytes kept as backslash escapes instead of raising.
    :return: the converted text.
    """
    text = invalid_json
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors="backslashreplace")

    # Lazy match from an opening backtick to the first backtick that is not
    # directly preceded by a backslash; DOTALL lets a span cross newlines.
    # NOTE: the pattern needs at least one non-backslash character before the
    # closing backtick, so an empty `` pair is not matched on its own.
    backtick_span = re.compile(r"`(.*?)[^\\\\]`", flags=re.MULTILINE|re.DOTALL)
    return backtick_span.sub(replace_backticks_with_double_quotes, text)
Loading