This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add logging and fix all logging contexts (#790)
* Add logging and fix all logging contexts

* Add call to `upper` to make the level config less error-prone (see the sketch below)

* Add docs page

* Linting and fixup

* Add note about log serialization

* Make fixes based on Staci's suggestions
sarayourfriend authored Jul 7, 2022
1 parent b7765c4 commit ce33572
Showing 13 changed files with 292 additions and 79 deletions.
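
The `upper` call mentioned in the commit message presumably lands in the Django logging settings, which are not among the three files shown below. A minimal sketch of that pattern, with an illustrative environment variable and config (names are not taken from this commit):

```python
import os

# Normalize the configured level so "info", "Info", and "INFO" all work.
# Logging level names are uppercase, so a lowercase value would otherwise
# raise "ValueError: Unknown level" when the logging config is loaded.
LOG_LEVEL = os.getenv("DJANGO_LOG_LEVEL", "INFO").upper()

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {"console": {"class": "logging.StreamHandler"}},
    "root": {"handlers": ["console"], "level": LOG_LEVEL},
}
```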
api/catalog/api/controllers/elasticsearch/related.py (19 additions, 0 deletions)
@@ -1,3 +1,6 @@
+import json
+import logging
+
 from elasticsearch_dsl import Search

 from catalog.api.controllers.elasticsearch.utils import (
@@ -8,10 +11,14 @@
 )


+parent_logger = logging.getLogger(__name__)
+
+
 def related_media(uuid, index, filter_dead):
     """
     Given a UUID, find related search results.
     """
+    logger = parent_logger.getChild("related_media")
     search_client = Search(using="default", index=index)

     # Convert UUID to sequential ID.
@@ -33,8 +40,20 @@ def related_media(uuid, index, filter_dead):
     start, end = get_query_slice(s, page_size, page, filter_dead)
     s = s[start:end]
     response = s.execute()
+    logger.debug(
+        "executed related query "
+        f"es_took_ms={response.took} "
+        f"query={json.dumps(s.to_dict())} "
+        f"response={json.dumps(response.to_dict())} "
+    )

     results = post_process_results(s, start, end, page_size, response, filter_dead)

     result_count, _ = get_result_and_page_count(response, results, page_size)
+
+    logger.debug(
+        "finished post processing and returning related "
+        f"result_count={result_count} "
+        f"results={json.dumps(results)}"
+    )
     return results, result_count
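
For context, a standalone sketch of the `parent_logger.getChild` pattern this commit applies across the API (the module name and message below are illustrative). Child loggers inherit the parent's handlers and level while extending its dotted name, so every record identifies the exact function that emitted it; the `json.dumps` calls in the diff keep the query and response on a single parseable line, in line with the commit's note about log serialization.

```python
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s %(name)s %(message)s",
)

# Module-level parent, as in related.py; the dotted name here is illustrative.
parent_logger = logging.getLogger("catalog.api.controllers.elasticsearch.related")


def related_media():
    # Child loggers need no handlers of their own; records propagate up
    # and carry the full "<module>.related_media" logger name.
    logger = parent_logger.getChild("related_media")
    logger.debug("executed related query es_took_ms=42")


related_media()
# DEBUG catalog.api.controllers.elasticsearch.related.related_media executed related query es_took_ms=42
```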
api/catalog/api/controllers/elasticsearch/search.py (65 additions, 14 deletions)
@@ -1,9 +1,8 @@
 from __future__ import annotations

 import json
-import logging as log
-import pprint
-from typing import List, Literal, Tuple
+import logging
+from typing import List, Literal, Optional, Tuple

 from django.conf import settings

@@ -21,19 +20,32 @@
 from catalog.api.serializers.media_serializers import MediaSearchRequestSerializer


+parent_logger = logging.getLogger(__name__)
+
+
 class FieldMapping:
     """
     Establishes a mapping between a field in ``MediaSearchRequestSerializer`` and the
     Elasticsearch index for a media.
     """

-    def __init__(self, serializer_field: str, es_field: str = None):
+    def __init__(self, serializer_field: str, es_field: Optional[str] = None):
         self.serializer_field: str = serializer_field
         """the name of the field in ``MediaSearchRequestSerializer``"""

         self.es_field: str = es_field or serializer_field
         """the name of the field in the Elasticsearch index"""

+    def __str__(self):
+        return (
+            "FieldMapping("
+            f"serializer_field={self.serializer_field}, "
+            f"es_field={self.es_field})"
+        )
+
+    def __repr__(self):
+        return str(self)
+

 def _quote_escape(query_string: str) -> str:
     """
@@ -69,13 +81,19 @@ def _apply_filter(
     :param behaviour: whether to accept (``filter``) or reject (``exclude``) the hit
     :return: the modified search query
     """

+    logger = parent_logger.getChild("_apply_filter")
     search_params = query_serializer.data
     if mapping.serializer_field in search_params:
         filters = []
         for arg in search_params[mapping.serializer_field].split(","):
             filters.append(Q("term", **{mapping.es_field: arg}))
         method = getattr(s, behaviour)  # can be ``s.filter`` or ``s.exclude``
+        logger.debug(
+            "applying filter "
+            f"behaviour={behaviour} "
+            f"mapping={mapping} "
+            f"filters={json.dumps(list(map(lambda f: f.to_dict(), filters)))} "
+        )
         return method("bool", should=filters)
     else:
         return s
@@ -96,6 +114,13 @@ def perform_search(
     :return: the list of search results with the page and result count
     """

+    logger = parent_logger.getChild("perform_search")
+    logger.info(
+        "searching with "
+        f"query_serializer.data={json.dumps(query_serializer.data)} "
+        f"index={index} "
+        f"(hashed) ip={ip} "
+    )
     s = Search(using="default", index=index)
     search_params = query_serializer.data

@@ -121,6 +146,7 @@

     # Exclude mature content
     if not search_params["mature"]:
+        logger.debug("excluding mature")
         s = s.exclude("term", mature=True)
     # Exclude sources with ``filter_content`` enabled
     s = exclude_filtered_providers(s)
Expand All @@ -130,15 +156,16 @@ def perform_search(

search_fields = ["tags.name", "title", "description"]
if "q" in search_params:
query = _quote_escape(search_params["q"])
escaped_query = _quote_escape(search_params["q"])
logger.info(f"searching with query term escaped_query={escaped_query}")
s = s.query(
"simple_query_string",
query=query,
query=escaped_query,
fields=search_fields,
default_operator="AND",
)
# Boost exact matches
quotes_stripped = query.replace('"', "")
quotes_stripped = escaped_query.replace('"', "")
exact_match_boost = Q(
"simple_query_string",
fields=["title"],
@@ -147,6 +174,7 @@
         )
         s.query = Q("bool", must=s.query, should=exact_match_boost)
     else:
+        logger.info("searching without query term")
         query_bases = ["creator", "title", ("tags", "tags.name")]
         for query_basis in query_bases:
             if isinstance(query_basis, tuple):
@@ -155,12 +183,21 @@
             serializer_field = es_field = query_basis
         if serializer_field in search_params:
             value = _quote_escape(search_params[serializer_field])
+            logger.debug(
+                "adding query for "
+                f"value={value} "
+                f"es_field={es_field} "
+                f"serializer_field={serializer_field}"
+            )
             s = s.query("simple_query_string", fields=[es_field], query=value)

     if settings.USE_RANK_FEATURES:
         feature_boost = {"standardized_popularity": 10000}
         rank_queries = []
         for field, boost in feature_boost.items():
+            logger.debug(
+                "applying ranked features " f"field={field} " f"boost={boost} "
+            )
             rank_queries.append(Q("rank_feature", field=field, boost=boost))
         s.query = Q("bool", must=s.query or EMPTY_QUERY, should=rank_queries)
@@ -183,15 +220,16 @@
     s = s[start:end]

     try:
-        if settings.VERBOSE_ES_RESPONSE:
-            log.info(pprint.pprint(s.to_dict()))
+        logger.info("executing query")
         search_response = s.execute()
-        log.info(
-            f"query={json.dumps(s.to_dict())}," f" es_took_ms={search_response.took}"
+        logger.debug(
+            "executed query "
+            f"es_took_ms={search_response.took} "
+            f"query={json.dumps(s.to_dict())} "
+            f"response={json.dumps(search_response.to_dict())} "
         )
-        if settings.VERBOSE_ES_RESPONSE:
-            log.info(pprint.pprint(search_response.to_dict()))
     except RequestError as e:
+        logger.error("encountered error executing query", exc_info=True)
         raise ValueError(e)

     results = post_process_results(
@@ -206,4 +244,17 @@
     result_count, page_count = get_result_and_page_count(
         search_response, results, search_params["page_size"]
     )
+
+    dumpable_results = (
+        results.to_dict()
+        if isinstance(results, Hit)
+        else list(map(lambda r: r.to_dict(), results))
+    )
+
+    logger.debug(
+        "finished post processing and returning "
+        f"result_count={result_count} "
+        f"page_count={page_count} "
+        f"results={json.dumps(dumpable_results)}"
+    )
     return results, page_count, result_count
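
The new `__str__`/`__repr__` pair on `FieldMapping` appears to exist mainly so the `mapping={mapping}` interpolation in `_apply_filter`'s debug line renders something readable instead of the default `<FieldMapping object at 0x...>`. A trimmed, self-contained illustration (the field names are made up):

```python
class FieldMapping:
    def __init__(self, serializer_field, es_field=None):
        self.serializer_field = serializer_field
        self.es_field = es_field or serializer_field

    def __str__(self):
        return (
            "FieldMapping("
            f"serializer_field={self.serializer_field}, "
            f"es_field={self.es_field})"
        )

    def __repr__(self):
        return str(self)


mapping = FieldMapping("license_type", "license")
print(f"mapping={mapping}")
# mapping=FieldMapping(serializer_field=license_type, es_field=license)
```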
api/catalog/api/controllers/elasticsearch/stats.py (16 additions, 3 deletions)
@@ -1,4 +1,5 @@
-import logging as log
+import json
+import logging
 from typing import Literal

 from django.core.cache import cache
@@ -7,6 +8,9 @@
 from elasticsearch_dsl import Search


+parent_logger = logging.getLogger(__name__)
+
+
 SOURCE_CACHE_TIMEOUT = 60 * 20  # seconds


@@ -18,14 +22,21 @@ def get_stats(index: Literal["image", "audio"]):
     :param index: the Elasticsearch index name
     :return: a dictionary mapping sources to the count of their media items
     """

+    logger = parent_logger.getChild("get_stats")
     source_cache_name = "sources-" + index
     try:
+        logger.debug(f"fetching source cache key={source_cache_name}")
         sources = cache.get(key=source_cache_name)
         if sources is not None:
+            logger.debug(f"cache hit! returning sources={json.dumps(sources)}")
             return sources
+        else:
+            logger.debug("cache missed")
     except ValueError:
-        log.warning("Source cache fetch failed")
+        # TODO: Improve error handling here.
+        # What failed? Why? Do we need to address it?
+        # Is this a critical issue? Why is this a "warning"?
+        logger.warning("Source cache fetch failed")

     # Don't increase `size` without reading this issue first:
     # https://github.com/elastic/elasticsearch/issues/18838
@@ -46,6 +57,8 @@
         sources = {}

     if sources:
+        logger.info(f"putting sources to cache key={source_cache_name}")
         cache.set(key=source_cache_name, timeout=SOURCE_CACHE_TIMEOUT, value=sources)

+    logger.debug(f"sources={json.dumps(sources)}")
     return sources
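
`get_stats` follows a conventional cache-aside flow. A condensed sketch of just that flow, with a hypothetical `compute_sources` helper standing in for the real Elasticsearch terms aggregation:

```python
from django.core.cache import cache

SOURCE_CACHE_TIMEOUT = 60 * 20  # seconds


def compute_sources(index: str) -> dict:
    # Stand-in for the aggregation query in stats.py; illustrative data only.
    return {"flickr": 1_000_000, "met": 250_000}


def get_cached_sources(index: str) -> dict:
    key = f"sources-{index}"
    sources = cache.get(key=key)
    if sources is not None:
        return sources  # cache hit

    sources = compute_sources(index)
    if sources:
        # Cache only non-empty results, mirroring the `if sources:` guard in
        # the diff, so a failed aggregation is retried on the next request
        # instead of pinning an empty mapping for the full 20-minute timeout.
        cache.set(key=key, timeout=SOURCE_CACHE_TIMEOUT, value=sources)
    return sources
```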
