From 172ac73b0147ecc5d1f7f01754f8c2ccca0845af Mon Sep 17 00:00:00 2001 From: Tor Colvin Date: Tue, 5 Mar 2024 07:34:56 -0500 Subject: [PATCH] CBG-3029 use matching regex replacement for go and python code (#6710) --- tools-tests/password_remover_test.py | 39 ++++++- tools-tests/tasks_test.py | 101 +++++++++++++++++ tools/password_remover.py | 161 +++++++++------------------ 3 files changed, 187 insertions(+), 114 deletions(-) create mode 100644 tools-tests/tasks_test.py diff --git a/tools-tests/password_remover_test.py b/tools-tests/password_remover_test.py index a14633a65a..e15f3a92aa 100644 --- a/tools-tests/password_remover_test.py +++ b/tools-tests/password_remover_test.py @@ -9,6 +9,8 @@ import json import unittest +import pytest + import password_remover class TestStripPasswordsFromUrl(unittest.TestCase): @@ -123,7 +125,7 @@ def test_non_parseable_config(self): } } """ - with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords, log_json_parsing_exceptions=False) + with_passwords_removed = password_remover.remove_passwords(unparseable_json_with_passwords) assert "foobar" not in with_passwords_removed @@ -311,5 +313,36 @@ def test_basic(self): json_text_expected = json.dumps(json_dict_expected, sort_keys=True) assert json_text_expected == json_text_actual - - +@pytest.mark.parametrize("input_str, expected", [ + ( + b'{"foo": "bar"}', + '{"foo": "bar"}', + ), + ( + b'{\"foo\": `bar`}', + '{"foo": "bar"}', + ), + ( + b'{\"foo\": `bar\nbaz\nboo`}', + r'{"foo": "bar\nbaz\nboo"}', + ), + ( + b'{\"foo\": `bar\n\"baz\n\tboo`}', + r'{"foo": "bar\n\"baz\n\tboo"}', + ), + ( + b'{\"foo\": `bar\n`, \"baz\": `howdy`}', + r'{"foo": "bar\n", "baz": "howdy"}', + ), + ( + b'{\"foo\": `bar\r\n`, \"baz\": `\r\nhowdy`}', + r'{"foo": "bar\n", "baz": "\nhowdy"}', + ), + ( + b'{\"foo\": `bar\\baz`, \"something\": `else\\is\\here`}', + r'{"foo": "bar\\baz", "something": "else\\is\\here"}', + ), + ]) +def test_convert_to_valid_json(input_str, expected): + assert password_remover.convert_to_valid_json(input_str) == expected + password_remover.get_parsed_json(input_str) diff --git a/tools-tests/tasks_test.py b/tools-tests/tasks_test.py new file mode 100644 index 0000000000..bd6c20e88e --- /dev/null +++ b/tools-tests/tasks_test.py @@ -0,0 +1,101 @@ +# Copyright 2024-Present Couchbase, Inc. +# +# Use of this software is governed by the Business Source License included +# in the file licenses/BSL-Couchbase.txt. As of the Change Date specified +# in that file, in accordance with the Business Source License, use of this +# software will be governed by the Apache License, Version 2.0, included in +# the file licenses/APL2.txt. + +import json +import pathlib + +import password_remover +import tasks + +import pytest + +VERBOSE = 2 + +INPUT_CONFIG = """\ +{ + "password": "password", + "server": "http://localhost:4984/db", + "databases": { + "db" : { + "users" : { + "foo" : "bar" + } + } + } +}""" + +REDACTED_OUTPUT = """\ +{ + "password": "******", + "server": "http://localhost:4984/db", + "databases": { + "db": { + "users": { + "foo": "bar" + } + } + } +}""" + + +@pytest.mark.parametrize("tag_userdata", [True, False]) +def test_add_file_task(tmpdir, tag_userdata): + if tag_userdata: + expected = REDACTED_OUTPUT + else: + expected = REDACTED_OUTPUT.replace("foo", "foo") + + filename = "config.json" + config_file = tmpdir.join(filename) + config_file.write(INPUT_CONFIG) + postprocessors = [password_remover.remove_passwords] + if tag_userdata: + postprocessors.append(password_remover.tag_userdata_in_server_config) + task = tasks.add_file_task( + config_file.strpath, + content_postprocessors=postprocessors, + ) + output_dir = tmpdir.mkdir("output") + runner = tasks.TaskRunner( + verbosity=VERBOSE, + default_name="sg.log", + tmp_dir=output_dir, + ) + runner.run(task) + runner.close_all_files() + + with open(pathlib.Path(runner.tmpdir) / filename) as fh: + assert expected in fh.read() + + +def test_make_curl_task(tmpdir, httpserver): + output = "curltask" + httpserver.expect_request("/").respond_with_json(json.loads(INPUT_CONFIG)) + task = tasks.make_curl_task( + "curltask", + httpserver.url_for("/"), + content_postprocessors=[ + password_remover.remove_passwords, + password_remover.tag_userdata_in_server_config, + ], + log_file=output, + ) + + output_dir = tmpdir.mkdir("output") + runner = tasks.TaskRunner( + verbosity=VERBOSE, + default_name="sg.log", + tmp_dir=output_dir, + ) + runner.run(task) + runner.close_all_files() + + with open(pathlib.Path(runner.tmpdir) / output) as fh: + assert REDACTED_OUTPUT in fh.read() + + httpserver.check() diff --git a/tools/password_remover.py b/tools/password_remover.py index 379a160765..4f50376c66 100644 --- a/tools/password_remover.py +++ b/tools/password_remover.py @@ -13,45 +13,39 @@ """ - +import enum import json import traceback +import re from urllib.parse import urlparse +from typing import Union -def is_valid_json(invalid_json): +def get_parsed_json(json_text: str) -> dict: """ - Is the given string valid JSON? + Turns json_text into a valid JSON object by replacing backquotes with single quotes, and returns it as a dictionary. """ - got_exception = True - try: - json.loads(invalid_json) - got_exception = False - except Exception as e: - pass + valid_json = convert_to_valid_json(json_text) - return got_exception is False + # Lower case keys so that "databases" works as a + # key even if the JSON has "Databases" as a key. + return lower_keys_dict(valid_json) -def tag_userdata_in_server_config(json_text, log_json_parsing_exceptions=True): +def tag_userdata_in_server_config(json_text): """ Content postprocessor that tags user data in a config ready for post-process redaction """ try: - valid_json = convert_to_valid_json(json_text) - - # Lower case keys so that "databases" works as a - # key even if the JSON has "Databases" as a key. - parsed_json = lower_keys_dict(valid_json) + parsed_json = get_parsed_json(json_text) tag_userdata_in_server_json(parsed_json) formatted_json_string = json.dumps(parsed_json, indent=4) return formatted_json_string except Exception as e: - if log_json_parsing_exceptions: - print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e)) - traceback.print_exc() + print("Exception trying to tag config user data in {0}. Exception: {1}".format(json_text, e)) + traceback.print_exc() return '{"Error":"Error in sgcollect_info password_remover.py trying to tag config user data. See logs for details"}' @@ -67,7 +61,7 @@ def tag_userdata_in_server_json(config): tag_userdata_in_db_json(dbs[db]) -def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True): +def tag_userdata_in_db_config(json_text): """ Content postprocessor that tags user data in a db config ready for post-process redaction """ @@ -83,9 +77,8 @@ def tag_userdata_in_db_config(json_text, log_json_parsing_exceptions=True): return formatted_json_string except Exception as e: - if log_json_parsing_exceptions: - print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e)) - traceback.print_exc() + print("Exception trying to tag db config user data in {0}. Exception: {1}".format(json_text, e)) + traceback.print_exc() return '{"Error":"Error in sgcollect_info password_remover.py trying to tag db config user data. See logs for details"}' @@ -157,16 +150,13 @@ def remove_passwords_from_config(config_fragment): remove_passwords_from_config(item) -def remove_passwords(json_text, log_json_parsing_exceptions=True): +def remove_passwords(json_text): """ Content postprocessor that strips out all of the sensitive passwords """ try: - valid_json = convert_to_valid_json(json_text) + parsed_json = get_parsed_json(json_text) - # Lower case keys so that "databases" works as a - # key even if the JSON has "Databases" as a key. - parsed_json = lower_keys_dict(valid_json) remove_passwords_from_config(parsed_json) # Append a trailing \n here to ensure there's adequate separation in sync_gateway.log @@ -174,9 +164,8 @@ def remove_passwords(json_text, log_json_parsing_exceptions=True): return formatted_json_string except Exception as e: - if log_json_parsing_exceptions: - print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e)) - traceback.print_exc() + print("Exception trying to remove passwords from {0}. Exception: {1}".format(json_text, e)) + traceback.print_exc() return '{"Error":"Error in sgcollect_info password_remover.py trying to remove passwords. See logs for details"}' @@ -235,91 +224,41 @@ def strip_password_from_url(url_string): return new_url -def escape_json_value(raw_value): +def replace_backticks_with_double_quotes(match): """ - Escape all invalid json characters like " to produce a valid json value + Escape all invalid json characters in sync gateway config files that occur between backticks to produce a valid json value. This matches ConvertBackticksToDoubleQuotes in go code. - Before: + Before:: - function(doc, oldDoc) { if (doc.type == "reject_me") { + function(doc, oldDoc) { if (doc.type == "reject_me") { - After: + After:: - function(doc, oldDoc) { if (doc.type == \"reject_me\") { + function(doc, oldDoc) { if (doc.type == \"reject_me\") { """ - escaped = raw_value - escaped = escaped.replace('\\', "\\\\") # Escape any backslashes - escaped = escaped.replace('"', '\\"') # Escape double quotes - escaped = escaped.replace("'", "\\'") # Escape single quotes - - # TODO: other stuff should be escaped like \n \t and other control characters - # See http://stackoverflow.com/questions/983451/where-can-i-find-a-list-of-escape-characters-required-for-my-json-ajax-return-ty - - return escaped - - -def convert_to_valid_json(invalid_json): - - STATE_OUTSIDE_BACKTICK = "STATE_OUTSIDE_BACKTICK" - STATE_INSIDE_BACKTICK = "STATE_INSIDE_BACKTICK" - state = STATE_OUTSIDE_BACKTICK - output = [] - sync_function_buffer = [] - - try: - invalid_json = invalid_json.decode('utf-8') - except (UnicodeDecodeError, AttributeError): - pass - - # Strip newlines - invalid_json = invalid_json.replace('\n', '') - - # Strip tabs - invalid_json = invalid_json.replace('\t', '') - - # read string char by char - for json_char in invalid_json: - - # if non-backtick character: - if json_char != '`': - - # if in OUTSIDE_BACKTICK state - if state == STATE_OUTSIDE_BACKTICK: - # append char to output - output.append(json_char) - - # if in INSIDE_BACKTICK state - elif state == STATE_INSIDE_BACKTICK: - # append to sync_function_buffer - sync_function_buffer.append(json_char) - - # if backtick character - elif json_char == '`': - - # if in OUTSIDE_BACKTICK state - if state == STATE_OUTSIDE_BACKTICK: - # transition to INSIDE_BACKTICK state - state = STATE_INSIDE_BACKTICK - - # if in INSIDE_BACKTICK state - elif state == STATE_INSIDE_BACKTICK: - # run sync_function_buffer through escape_json_value() - sync_function_buffer_str = "".join(sync_function_buffer) - sync_function_buffer_str = escape_json_value(sync_function_buffer_str) - - # append to output - output.append('"') # append a double quote - output.append(sync_function_buffer_str) - output.append('"') # append a double quote - - # empty the sync_function_buffer - sync_function_buffer = [] - - # transition to OUTSIDE_BACKTICK state - state = STATE_OUTSIDE_BACKTICK - - output_str = "".join(output) - return output_str - + replacements = [ + ("\\", "\\\\"), # replace literal slash with two slashes + ("\r\n", "\\n"), + ("\r", ""), + ("\n", "\\n"), + ("\t", "\\t"), + (r'"', r'\"') + ] + expr = match.group(0) + for search, replace in replacements: + expr = expr.replace(search, replace) + + # Replace the backquotes with double-quotes + expr = '"' + expr[1:] + expr = expr[:-1] + '"' + return expr + +def convert_to_valid_json(invalid_json: Union[bytes,str]) -> str: + """ + Converts json text from a sync gateway config file to valid json by replacing backticks with double quotes and escaping invalid json characters inside the backquotes. + """ + if isinstance(invalid_json, bytes): + invalid_json = invalid_json.decode('utf-8', errors="backslashreplace") + return re.sub(r"`(.*?)[^\\\\]`", replace_backticks_with_double_quotes, invalid_json, flags=re.MULTILINE|re.DOTALL)