From 6d53058604e7eac97076df45d87e7c9f6747a480 Mon Sep 17 00:00:00 2001
From: Karthik Bekal Pattathana
 <133984042+karthikbekalp@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:08:04 +0000
Subject: [PATCH] fix!: Ensure all open uses UTF-8 instead of default encoding.

Signed-off-by: Karthik Bekal Pattathana <133984042+karthikbekalp@users.noreply.github.com>
---
 .github/scripts/get_latest_changelog.py       |  2 +-
 .../_background/backend_runner.py             |  4 ++-
 .../_background/frontend_runner.py            |  9 +++---
 .../adaptor_runtime/_background/loaders.py    |  2 +-
 .../_background/log_buffers.py                |  4 +--
 src/openjd/adaptor_runtime/_entrypoint.py     |  8 +++--
 .../adaptor_runtime/adaptors/_validator.py    |  4 +--
 .../adaptors/configuration/_configuration.py  |  6 ++--
 .../configuration/_configuration_manager.py   |  2 +-
 .../integ/_utils/test_secure_open.py          |  6 ++--
 .../integ/background/test_background_mode.py  |  2 +-
 .../configuration/test_configuration.py       | 32 +++++++++++++------
 .../test_configuration_manager.py             |  2 +-
 .../unit/background/test_backend_runner.py    |  4 +--
 .../unit/background/test_frontend_runner.py   |  2 +-
 .../unit/background/test_log_buffers.py       |  4 +--
 .../adaptor_runtime/unit/test_entrypoint.py   |  4 +--
 test/openjd/test_copyright_header.py          |  2 +-
 18 files changed, 61 insertions(+), 38 deletions(-)

diff --git a/.github/scripts/get_latest_changelog.py b/.github/scripts/get_latest_changelog.py
index 5616efb..c6a7d33 100644
--- a/.github/scripts/get_latest_changelog.py
+++ b/.github/scripts/get_latest_changelog.py
@@ -32,7 +32,7 @@
 import re
 
 h2 = r"^##\s.*$"
-with open("CHANGELOG.md") as f:
+with open("CHANGELOG.md", encoding="utf-8") as f:
     contents = f.read()
 matches = re.findall(h2, contents, re.MULTILINE)
 changelog = contents[: contents.find(matches[1]) - 1] if len(matches) > 1 else contents
diff --git a/src/openjd/adaptor_runtime/_background/backend_runner.py b/src/openjd/adaptor_runtime/_background/backend_runner.py
index 5d4e16e..a1f7329 100644
--- a/src/openjd/adaptor_runtime/_background/backend_runner.py
+++ b/src/openjd/adaptor_runtime/_background/backend_runner.py
@@ -116,7 +116,9 @@ def run(self, *, on_connection_file_written: List[Callable[[], None]] | None = N
             raise
 
         try:
-            with secure_open(self._connection_file_path, open_mode="w") as conn_file:
+            with secure_open(
+                self._connection_file_path, open_mode="w", encoding="utf-8"
+            ) as conn_file:
                 json.dump(
                     ConnectionSettings(server_path),
                     conn_file,
diff --git a/src/openjd/adaptor_runtime/_background/frontend_runner.py b/src/openjd/adaptor_runtime/_background/frontend_runner.py
index c83b4ba..e47d4dc 100644
--- a/src/openjd/adaptor_runtime/_background/frontend_runner.py
+++ b/src/openjd/adaptor_runtime/_background/frontend_runner.py
@@ -153,7 +153,7 @@ def init(
         bootstrap_output_path = os.path.join(
             bootstrap_log_dir, f"adaptor-runtime-background-bootstrap-output-{bootstrap_id}.log"
         )
-        output_log_file = open(bootstrap_output_path, mode="w+")
+        output_log_file = open(bootstrap_output_path, mode="w+", encoding="utf-8")
         try:
             process = subprocess.Popen(
                 args,
@@ -194,7 +194,7 @@ def init(
             if process.stderr:
                 process.stderr.close()
 
-            with open(bootstrap_output_path, mode="r") as f:
+            with open(bootstrap_output_path, mode="r", encoding="utf-8") as f:
                 bootstrap_output = f.readlines()
             _logger.info("========== BEGIN BOOTSTRAP OUTPUT CONTENTS ==========")
             for line in bootstrap_output:
@@ -203,7 +203,7 @@ def init(
 
             _logger.info(f"Checking for bootstrap logs at '{bootstrap_log_path}'")
             try:
-                with open(bootstrap_log_path, mode="r") as f:
+                with open(bootstrap_log_path, mode="r", encoding="utf-8") as f:
                     bootstrap_logs = f.readlines()
             except Exception as e:
                 _logger.error(f"Failed to get bootstrap logs at '{bootstrap_log_path}': {e}")
@@ -442,7 +442,8 @@ def _wait_for_connection_file(
 
     def file_is_openable() -> bool:
         try:
-            open(filepath, mode="r").close()
+            with open(filepath, mode="r", encoding="utf-8"):
+                pass
         except IOError:
             # File is not available yet
             return False
diff --git a/src/openjd/adaptor_runtime/_background/loaders.py b/src/openjd/adaptor_runtime/_background/loaders.py
index e258a29..1ccd726 100644
--- a/src/openjd/adaptor_runtime/_background/loaders.py
+++ b/src/openjd/adaptor_runtime/_background/loaders.py
@@ -35,7 +35,7 @@ class ConnectionSettingsFileLoader(ConnectionSettingsLoader):
 
     def load(self) -> ConnectionSettings:
         try:
-            with open(self.file_path) as conn_file:
+            with open(self.file_path, encoding="utf-8") as conn_file:
                 loaded_settings = json.load(conn_file)
         except OSError as e:
             errmsg = f"Failed to open connection file '{self.file_path}': {e}"
diff --git a/src/openjd/adaptor_runtime/_background/log_buffers.py b/src/openjd/adaptor_runtime/_background/log_buffers.py
index 1aa3db6..394c1bd 100644
--- a/src/openjd/adaptor_runtime/_background/log_buffers.py
+++ b/src/openjd/adaptor_runtime/_background/log_buffers.py
@@ -132,7 +132,7 @@ def __init__(self, filepath: str, *, formatter: logging.Formatter | None = None)
     def buffer(self, record: logging.LogRecord) -> None:
         with (
             self._file_lock,
-            secure_open(self._filepath, open_mode="a") as f,
+            secure_open(self._filepath, open_mode="a", encoding="utf-8") as f,
         ):
             f.write(self._format(record))
 
@@ -142,7 +142,7 @@ def chunk(self) -> BufferedOutput:
         with (
             self._chunk_lock,
             self._file_lock,
-            open(self._filepath, mode="r") as f,
+            open(self._filepath, mode="r", encoding="utf-8") as f,
         ):
             self._chunk.id = id
             f.seek(self._chunk.start)
diff --git a/src/openjd/adaptor_runtime/_entrypoint.py b/src/openjd/adaptor_runtime/_entrypoint.py
index 23ad98d..df1a906 100644
--- a/src/openjd/adaptor_runtime/_entrypoint.py
+++ b/src/openjd/adaptor_runtime/_entrypoint.py
@@ -179,6 +179,10 @@ def _init_loggers(self, *, bootstrap_log_path: str | None = None) -> _LogConfig:
         formatter = ConditionalFormatter(
             "%(levelname)s: %(message)s", ignore_patterns=[_OPENJD_LOG_REGEX]
         )
+
+        # Ensure stdout uses UTF-8 encoding to avoid issues with
+        # encoding Non-ASCII characters.
+        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
         stream_handler = logging.StreamHandler(sys.stdout)
         stream_handler.setFormatter(formatter)
 
@@ -590,14 +594,14 @@ def _load_data(data: str) -> dict:
 
 def _load_yaml_json(data: str) -> Any:
     """
-    Loads a YAML/JSON file/string.
+    Loads a YAML/JSON file/string using the UTF-8 encoding.
 
     Note that yaml.safe_load() is capable of loading JSON documents.
     """
     loaded_yaml = None
     if data.startswith("file://"):
         filepath = data[len("file://") :]
-        with open(filepath) as yaml_file:
+        with open(filepath, encoding="utf-8") as yaml_file:
             loaded_yaml = yaml.safe_load(yaml_file)
     else:
         loaded_yaml = yaml.safe_load(data)
diff --git a/src/openjd/adaptor_runtime/adaptors/_validator.py b/src/openjd/adaptor_runtime/adaptors/_validator.py
index cccd746..7f292e4 100644
--- a/src/openjd/adaptor_runtime/adaptors/_validator.py
+++ b/src/openjd/adaptor_runtime/adaptors/_validator.py
@@ -74,7 +74,7 @@ def from_schema_file(schema_path: str) -> AdaptorDataValidator:
             schema_path (str): The path to the JSON schema file to use.
         """
         try:
-            with open(schema_path) as schema_file:
+            with open(schema_path, encoding="utf-8") as schema_file:
                 schema = json.load(schema_file)
         except json.JSONDecodeError as e:
             _logger.error(f"Failed to decode JSON schema file: {e}")
@@ -148,7 +148,7 @@ def _load_yaml_json(data: str) -> Any:
     loaded_yaml = None
     if data.startswith("file://"):
         filepath = data[len("file://") :]
-        with open(filepath) as yaml_file:
+        with open(filepath, encoding="utf-8") as yaml_file:
             loaded_yaml = yaml.safe_load(yaml_file)
     else:
         loaded_yaml = yaml.safe_load(data)
diff --git a/src/openjd/adaptor_runtime/adaptors/configuration/_configuration.py b/src/openjd/adaptor_runtime/adaptors/configuration/_configuration.py
index 320ffbb..92b015f 100644
--- a/src/openjd/adaptor_runtime/adaptors/configuration/_configuration.py
+++ b/src/openjd/adaptor_runtime/adaptors/configuration/_configuration.py
@@ -82,7 +82,8 @@ def from_file(
         """
 
         try:
-            config = json.load(open(config_path))
+            with open(config_path, encoding="utf-8") as config_file:
+                config = json.load(config_file)
         except OSError as e:
             _logger.error(f"Failed to open configuration at {config_path}: {e}")
             raise
@@ -102,7 +103,8 @@ def from_file(
         schema_paths = schema_path if isinstance(schema_path, list) else [schema_path]
         for path in schema_paths:
             try:
-                schema = json.load(open(path))
+                with open(path, encoding="utf-8") as schema_file:
+                    schema = json.load(schema_file)
             except OSError as e:
                 _logger.error(f"Failed to open configuration schema at {path}: {e}")
                 raise
diff --git a/src/openjd/adaptor_runtime/adaptors/configuration/_configuration_manager.py b/src/openjd/adaptor_runtime/adaptors/configuration/_configuration_manager.py
index b5f8e0e..95b0205 100644
--- a/src/openjd/adaptor_runtime/adaptors/configuration/_configuration_manager.py
+++ b/src/openjd/adaptor_runtime/adaptors/configuration/_configuration_manager.py
@@ -95,7 +95,7 @@ def _ensure_config_file(filepath: str, *, create: bool = False) -> bool:
         _logger.info(f"Creating empty configuration at {filepath}")
         try:
             os.makedirs(os.path.dirname(filepath), mode=stat.S_IRWXU, exist_ok=True)
-            with secure_open(filepath, open_mode="w") as f:
+            with secure_open(filepath, open_mode="w", encoding="utf-8") as f:
                 json.dump({}, f)
         except OSError as e:
             _logger.warning(f"Could not write empty configuration to {filepath}: {e}")
diff --git a/test/openjd/adaptor_runtime/integ/_utils/test_secure_open.py b/test/openjd/adaptor_runtime/integ/_utils/test_secure_open.py
index 2508ad4..8f28840 100644
--- a/test/openjd/adaptor_runtime/integ/_utils/test_secure_open.py
+++ b/test/openjd/adaptor_runtime/integ/_utils/test_secure_open.py
@@ -24,7 +24,7 @@ def create_file():
         f"secure_open_test_{''.join(random.choice(string.ascii_letters) for _ in range(10))}.txt"
     )
     test_file_path = os.path.join(tempfile.gettempdir(), test_file_name)
-    with secure_open(test_file_path, open_mode="w") as test_file:
+    with secure_open(test_file_path, open_mode="w", encoding="utf-8") as test_file:
         test_file.write(file_content)
     yield test_file_path, file_content
     os.remove(test_file_path)
@@ -36,7 +36,7 @@ def test_secure_open_write_and_read(self, create_file):
         Test if the file owner can write and read the file
         """
         test_file_path, file_content = create_file
-        with secure_open(test_file_path, open_mode="r") as test_file:
+        with secure_open(test_file_path, open_mode="r", encoding="utf-8") as test_file:
             result = test_file.read()
         assert result == file_content
 
@@ -61,7 +61,7 @@ def test_secure_open_file_windows_permission(self, create_file, win_test_user):
 
         try:
             with pytest.raises(PermissionError):
-                with open(test_file_path, "r") as f:
+                with open(test_file_path, "r", encoding="utf-8") as f:
                     f.read()
         finally:
             # Revert the impersonation
diff --git a/test/openjd/adaptor_runtime/integ/background/test_background_mode.py b/test/openjd/adaptor_runtime/integ/background/test_background_mode.py
index 974e736..5357f48 100644
--- a/test/openjd/adaptor_runtime/integ/background/test_background_mode.py
+++ b/test/openjd/adaptor_runtime/integ/background/test_background_mode.py
@@ -46,7 +46,7 @@ def mock_runtime_logger_level(self, tmpdir: pathlib.Path):
         # Set up a config file for the backend process
         config = {"log_level": "DEBUG"}
         config_path = os.path.join(tmpdir, "configuration.json")
-        with open(config_path, mode="w") as f:
+        with open(config_path, mode="w", encoding="utf-8") as f:
             json.dump(config, f)
 
         # Override the default config path to the one we just created
diff --git a/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration.py b/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration.py
index 3deeabd..6ea0113 100644
--- a/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration.py
+++ b/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration.py
@@ -55,7 +55,9 @@ def test_loads_schema(
         result = Configuration.from_file(config_path, schema_path)
 
         # THEN
-        mock_open.assert_has_calls([call(config_path), call(schema_path)])
+        mock_open.assert_has_calls(
+            [call(config_path, encoding="utf-8"), call(schema_path, encoding="utf-8")]
+        )
         assert mock_load.call_count == 2
         mock_validate.assert_called_once_with(config, schema)
         assert result._config is config
@@ -77,7 +79,7 @@ def test_skips_validation_when_no_schema(
         result = Configuration.from_file(config_path)
 
         # THEN
-        mock_open.assert_called_once_with(config_path)
+        mock_open.assert_called_once_with(config_path, encoding="utf-8")
         mock_load.assert_called_once()
         assert (
             f"JSON Schema file path not provided. Configuration file {config_path} will not be "
@@ -107,7 +109,13 @@ def test_validates_against_multiple_schemas(
         result = Configuration.from_file(config_path, [schema_path, schema2_path])
 
         # THEN
-        mock_open.assert_has_calls([call(config_path), call(schema_path), call(schema2_path)])
+        mock_open.assert_has_calls(
+            [
+                call(config_path, encoding="utf-8"),
+                call(schema_path, encoding="utf-8"),
+                call(schema2_path, encoding="utf-8"),
+            ]
+        )
         assert mock_load.call_count == 3
         mock_validate.assert_has_calls([call(config, schema), call(config, schema2)])
         assert result._config is config
@@ -128,7 +136,7 @@ def test_raises_when_nonvalid_schema_path_value(
 
         # THEN
         mock_load.assert_called_once()
-        mock_open.assert_called_once_with(config_path)
+        mock_open.assert_called_once_with(config_path, encoding="utf-8")
         assert raised_err.match(f"Schema path cannot be an empty {type(schema_path)}")
 
     @patch.object(configuration.json, "load")
@@ -148,7 +156,9 @@ def test_raises_when_schema_open_fails(
 
         # THEN
         mock_load.assert_called_once()
-        mock_open.assert_has_calls([call(config_path), call(schema_path)])
+        mock_open.assert_has_calls(
+            [call(config_path, encoding="utf-8"), call(schema_path, encoding="utf-8")]
+        )
         assert raised_err.value is err
         assert f"Failed to open configuration schema at {schema_path}: " in caplog.text
 
@@ -169,7 +179,9 @@ def test_raises_when_schema_json_decode_fails(
 
         # THEN
         assert mock_load.call_count == 2
-        mock_open.assert_has_calls([call(config_path), call(schema_path)])
+        mock_open.assert_has_calls(
+            [call(config_path, encoding="utf-8"), call(schema_path, encoding="utf-8")]
+        )
         assert raised_err.value is err
         assert f"Failed to decode configuration schema at {schema_path}: " in caplog.text
 
@@ -187,7 +199,7 @@ def test_raises_when_config_open_fails(
             Configuration.from_file(config_path, "")
 
         # THEN
-        mock_open.assert_called_once_with(config_path)
+        mock_open.assert_called_once_with(config_path, encoding="utf-8")
         assert raised_err.value is err
         assert f"Failed to open configuration at {config_path}: " in caplog.text
 
@@ -206,7 +218,7 @@ def test_raises_when_config_json_decode_fails(
             Configuration.from_file(config_path, "")
 
         # THEN
-        mock_open.assert_called_once_with(config_path)
+        mock_open.assert_called_once_with(config_path, encoding="utf-8")
         mock_load.assert_called_once()
         assert raised_err.value is err
         assert f"Failed to decode configuration at {config_path}: " in caplog.text
@@ -234,7 +246,9 @@ def test_raises_when_config_fails_jsonschema_validation(
             Configuration.from_file(config_path, schema_path)
 
         # THEN
-        mock_open.assert_has_calls([call(config_path), call(schema_path)])
+        mock_open.assert_has_calls(
+            [call(config_path, encoding="utf-8"), call(schema_path, encoding="utf-8")]
+        )
         assert mock_load.call_count == 2
         mock_validate.assert_called_once_with(config, schema)
         assert raised_err.value is mock_validate.side_effect
diff --git a/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration_manager.py b/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration_manager.py
index f48c436..8146c40 100644
--- a/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration_manager.py
+++ b/test/openjd/adaptor_runtime/unit/adaptors/configuration/test_configuration_manager.py
@@ -119,7 +119,7 @@ def test_create(
         # THEN
         assert result == created
         mock_exists.assert_called_once_with(path)
-        open_mock.assert_called_once_with(path, open_mode="w")
+        open_mock.assert_called_once_with(path, open_mode="w", encoding="utf-8")
         assert f'Configuration file at "{path}" does not exist.' in caplog.text
         assert f"Creating empty configuration at {path}" in caplog.text
         if created:
diff --git a/test/openjd/adaptor_runtime/unit/background/test_backend_runner.py b/test/openjd/adaptor_runtime/unit/background/test_backend_runner.py
index 1ce62a6..9b52eed 100644
--- a/test/openjd/adaptor_runtime/unit/background/test_backend_runner.py
+++ b/test/openjd/adaptor_runtime/unit/background/test_backend_runner.py
@@ -96,7 +96,7 @@ def test_run(
         )
         mock_thread.assert_called_once()
         mock_thread.return_value.start.assert_called_once()
-        open_mock.assert_called_once_with(conn_file, open_mode="w")
+        open_mock.assert_called_once_with(conn_file, open_mode="w", encoding="utf-8")
         mock_json_dump.assert_called_once_with(
             ConnectionSettings(socket_path),
             open_mock.return_value,
@@ -170,7 +170,7 @@ def test_run_raises_when_writing_connection_file_fails(
         ]
         mock_thread.assert_called_once()
         mock_thread.return_value.start.assert_called_once()
-        open_mock.assert_called_once_with(conn_file, open_mode="w")
+        open_mock.assert_called_once_with(conn_file, open_mode="w", encoding="utf-8")
         mock_thread.return_value.join.assert_called_once()
         if OSName.is_posix():
             mock_os_remove.assert_has_calls([call(conn_file), call(socket_path)])
diff --git a/test/openjd/adaptor_runtime/unit/background/test_frontend_runner.py b/test/openjd/adaptor_runtime/unit/background/test_frontend_runner.py
index 9545ea5..0bb19e8 100644
--- a/test/openjd/adaptor_runtime/unit/background/test_frontend_runner.py
+++ b/test/openjd/adaptor_runtime/unit/background/test_frontend_runner.py
@@ -963,7 +963,7 @@ def test_waits_for_file(
         # THEN
         mock_exists.assert_has_calls([call(filepath)] * 2)
         mock_sleep.assert_has_calls([call(interval)] * 3)
-        open_mock.assert_has_calls([call(filepath, mode="r")] * 2)
+        open_mock.assert_has_calls([call(filepath, mode="r", encoding="utf-8")] * 2)
 
     @patch.object(frontend_runner.time, "sleep")
     @patch.object(frontend_runner.os.path, "exists")
diff --git a/test/openjd/adaptor_runtime/unit/background/test_log_buffers.py b/test/openjd/adaptor_runtime/unit/background/test_log_buffers.py
index 7ccb7c3..56da529 100644
--- a/test/openjd/adaptor_runtime/unit/background/test_log_buffers.py
+++ b/test/openjd/adaptor_runtime/unit/background/test_log_buffers.py
@@ -120,7 +120,7 @@ def test_buffer(self) -> None:
             buffer.buffer(mock_record)
 
         # THEN
-        open_mock.assert_called_once_with(filepath, open_mode="a")
+        open_mock.assert_called_once_with(filepath, open_mode="a", encoding="utf-8")
         handle = open_mock.return_value
         handle.write.assert_called_once_with(mock_record.msg)
 
@@ -140,7 +140,7 @@ def test_chunk(self, mocked_chunk_id: Tuple[str, MagicMock]) -> None:
 
         # THEN
         mock_create_id.assert_called_once()
-        open_mock.assert_called_once_with(filepath, mode="r")
+        open_mock.assert_called_once_with(filepath, mode="r", encoding="utf-8")
         handle = open_mock.return_value
         handle.seek.assert_called_once_with(buffer._chunk.start)
         handle.read.assert_called_once()
diff --git a/test/openjd/adaptor_runtime/unit/test_entrypoint.py b/test/openjd/adaptor_runtime/unit/test_entrypoint.py
index 46adc5d..e859ac1 100644
--- a/test/openjd/adaptor_runtime/unit/test_entrypoint.py
+++ b/test/openjd/adaptor_runtime/unit/test_entrypoint.py
@@ -865,7 +865,7 @@ def test_accepts_file(self, input: str, expected: dict):
 
         # THEN
         assert output == expected
-        open_mock.assert_called_once_with(filepath)
+        open_mock.assert_called_once_with(filepath, encoding="utf-8")
 
     @patch.object(runtime_entrypoint, "open")
     def test_raises_on_os_error(self, mock_open: MagicMock, caplog: pytest.LogCaptureFixture):
@@ -880,7 +880,7 @@ def test_raises_on_os_error(self, mock_open: MagicMock, caplog: pytest.LogCaptur
 
         # THEN
         assert raised_err.value is mock_open.side_effect
-        mock_open.assert_called_once_with(filepath)
+        mock_open.assert_called_once_with(filepath, encoding="utf-8")
         assert "Failed to open data file: " in caplog.text
 
     def test_raises_when_parsing_fails(self, caplog: pytest.LogCaptureFixture):
diff --git a/test/openjd/test_copyright_header.py b/test/openjd/test_copyright_header.py
index 23b22db..8689a77 100644
--- a/test/openjd/test_copyright_header.py
+++ b/test/openjd/test_copyright_header.py
@@ -33,7 +33,7 @@ def _check_file(filename: Path) -> None:
 def _is_version_file(filename: Path) -> bool:
     if filename.name != "_version.py":
         return False
-    with open(filename) as infile:
+    with open(filename, encoding="utf-8") as infile:
         lines_read = 0
         for line in infile:
             if _generated_by_scm.search(line):