From a0304a039e52cdb6494d5767b52261bb50463954 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe <carey@cmetcalfe.ca>
Date: Mon, 25 Nov 2024 12:07:59 -0500
Subject: [PATCH] Use `LogRecord.getMessage` to get OTLP body (#4216)

---
 CHANGELOG.md                                  |  3 +
 .../sdk/_logs/_internal/__init__.py           | 56 ++++++-------------
 opentelemetry-sdk/tests/logs/test_export.py   | 46 +++++++++++++++
 3 files changed, 65 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf0ae6a891..b8fb2bd3f4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([#4224](https://github.com/open-telemetry/opentelemetry-python/pull/4224))
 - Drop `OTEL_PYTHON_EXPERIMENTAL_DISABLE_PROMETHEUS_UNIT_NORMALIZATION` environment variable
   ([#4217](https://github.com/open-telemetry/opentelemetry-python/pull/4217))
+- Improve compatibility with other logging libraries that override
+  `LogRecord.getMessage()` in order to customize message formatting
+  ([#4216](https://github.com/open-telemetry/opentelemetry-python/pull/4216))
 
 ## Version 1.27.0/0.48b0 (2024-08-28)
 
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py
index 2d52b1bc74..110bb89c99 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py
@@ -438,6 +438,7 @@ def force_flush(self, timeout_millis: int = 30000) -> bool:
         "exc_text",
         "filename",
         "funcName",
+        "getMessage",
         "message",
         "levelname",
         "levelno",
@@ -503,51 +504,26 @@ def _translate(self, record: logging.LogRecord) -> LogRecord:
         observered_timestamp = time_ns()
         span_context = get_current_span().get_span_context()
         attributes = self._get_attributes(record)
-        # This comment is taken from GanyedeNil's PR #3343, I have redacted it
-        # slightly for clarity:
-        # According to the definition of the Body field type in the
-        # OTel 1.22.0 Logs Data Model article, the Body field should be of
-        # type 'any' and should not use the str method to directly translate
-        # the msg. This is because str only converts non-text types into a
-        # human-readable form, rather than a standard format, which leads to
-        # the need for additional operations when collected through a log
-        # collector.
-        # Considering that he Body field should be of type 'any' and should not
-        # use the str method but record.msg is also a string type, then the
-        # difference is just the self.args formatting?
-        # The primary consideration depends on the ultimate purpose of the log.
-        # Converting the default log directly into a string is acceptable as it
-        # will be required to be presented in a more readable format. However,
-        # this approach might not be as "standard" when hoping to aggregate
-        # logs and perform subsequent data analysis. In the context of log
-        # extraction, it would be more appropriate for the msg to be
-        # converted into JSON format or remain unchanged, as it will eventually
-        # be transformed into JSON. If the final output JSON data contains a
-        # structure that appears similar to JSON but is not, it may confuse
-        # users. This is particularly true for operation and maintenance
-        # personnel who need to deal with log data in various languages.
-        # Where is the JSON converting occur? and what about when the msg
-        # represents something else but JSON, the expected behavior change?
-        # For the ConsoleLogExporter, it performs the to_json operation in
-        # opentelemetry.sdk._logs._internal.export.ConsoleLogExporter.__init__,
-        # so it can handle any type of input without problems. As for the
-        # OTLPLogExporter, it also handles any type of input encoding in
-        # _encode_log located in
-        # opentelemetry.exporter.otlp.proto.common._internal._log_encoder.
-        # Therefore, no extra operation is needed to support this change.
-        # The only thing to consider is the users who have already been using
-        # this SDK. If they upgrade the SDK after this change, they will need
-        # to readjust their logging collection rules to adapt to the latest
-        # output format. Therefore, this change is considered a breaking
-        # change and needs to be upgraded at an appropriate time.
         severity_number = std_to_otel(record.levelno)
         if self.formatter:
             body = self.format(record)
         else:
-            if isinstance(record.msg, str) and record.args:
-                body = record.msg % record.args
-            else:
+            # `record.getMessage()` uses `record.msg` as a template to format
+            # `record.args` into. There is a special case in `record.getMessage()`
+            # where it will only attempt formatting if args are provided,
+            # otherwise, it just stringifies `record.msg`.
+            #
+            # Since the OTLP body field has a type of 'any' and the logging module
+            # is sometimes used in such a way that objects incorrectly end up
+            # set as record.msg, in those cases we would like to bypass
+            # `record.getMessage()` completely and set the body to the object
+            # itself instead of its string representation.
+            # For more background, see: https://github.com/open-telemetry/opentelemetry-python/pull/4216
+            if not record.args and not isinstance(record.msg, str):
+                # no args are provided so it's *mostly* safe to use the message template as the body
                 body = record.msg
+            else:
+                body = record.getMessage()
 
         # related to https://github.com/open-telemetry/opentelemetry-python/issues/3548
         # Severity Text = WARN as defined in https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/logs/data-model.md#displaying-severity.
diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py
index ce31d3991f..2e00bad653 100644
--- a/opentelemetry-sdk/tests/logs/test_export.py
+++ b/opentelemetry-sdk/tests/logs/test_export.py
@@ -230,6 +230,52 @@ def test_simple_log_record_processor_different_msg_types(self):
                 item.instrumentation_scope.name, "different_msg_types"
             )
 
+    def test_simple_log_record_processor_custom_single_obj(self):
+        """
+        Tests that special-case handling for logging a single non-string object
+        is correctly applied.
+        """
+        exporter = InMemoryLogExporter()
+        log_record_processor = BatchLogRecordProcessor(exporter)
+
+        provider = LoggerProvider()
+        provider.add_log_record_processor(log_record_processor)
+
+        logger = logging.getLogger("single_obj")
+        logger.addHandler(LoggingHandler(logger_provider=provider))
+
+        # NOTE: the behaviour of `record.getMessage` is detailed in the
+        # `logging.Logger.debug` documentation:
+        # > The msg is the message format string, and the args are the arguments
+        # > which are merged into msg using the string formatting operator. [...]
+        # > No % formatting operation is performed on msg when no args are supplied.
+
+        # This test uses the presence of '%s' in the first arg to determine if
+        # formatting was applied
+
+        # string msg with no args - getMessage bypasses formatting and sets the string directly
+        logger.warning("a string with a percent-s: %s")
+        # string msg with args - getMessage formats args into the msg
+        logger.warning("a string with a percent-s: %s", "and arg")
+        # non-string msg with args - getMessage stringifies msg and formats args into it
+        logger.warning(["a non-string with a percent-s", "%s"], "and arg")
+        # non-string msg with no args:
+        #  - normally getMessage would stringify the object and bypass formatting
+        #  - SPECIAL CASE: bypass stringification as well to keep the raw object
+        logger.warning(["a non-string with a percent-s", "%s"])
+        log_record_processor.shutdown()
+
+        finished_logs = exporter.get_finished_logs()
+        expected = [
+            ("a string with a percent-s: %s"),
+            ("a string with a percent-s: and arg"),
+            ("['a non-string with a percent-s', 'and arg']"),
+            (["a non-string with a percent-s", "%s"]),
+        ]
+        for emitted, expected in zip(finished_logs, expected):
+            self.assertEqual(emitted.log_record.body, expected)
+            self.assertEqual(emitted.instrumentation_scope.name, "single_obj")
+
     def test_simple_log_record_processor_different_msg_types_with_formatter(
         self,
     ):