This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add logging and fix all logging contexts (#790)
* Add logging and fix all logging contexts

* Add call to `upper` to make the level config less error-prone (see the sketch below)

* Add docs page

* Linting and fixup

* Add note about log serialization

* Make fixes based on Staci's suggestions
sarayourfriend authored Jul 7, 2022
1 parent b7765c4 commit ce33572
Showing 13 changed files with 292 additions and 79 deletions.
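
The `upper` call mentioned in the commit message presumably lands in the Django logging settings, which are not among the three files shown below. A minimal sketch of that pattern, with an illustrative environment variable and config (names are not taken from this commit):

```python
import os

# Normalize the configured level so "info", "Info", and "INFO" all work.
# Logging level names are uppercase, so a lowercase value would otherwise
# raise "ValueError: Unknown level" when the logging config is loaded.
LOG_LEVEL = os.getenv("DJANGO_LOG_LEVEL", "INFO").upper()

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {"console": {"class": "logging.StreamHandler"}},
    "root": {"handlers": ["console"], "level": LOG_LEVEL},
}
```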
api/catalog/api/controllers/elasticsearch/related.py (19 additions, 0 deletions)
@@ -1,3 +1,6 @@
+import json
+import logging
+
 from elasticsearch_dsl import Search

 from catalog.api.controllers.elasticsearch.utils import (
@@ -8,10 +11,14 @@
 )


+parent_logger = logging.getLogger(__name__)
+
+
 def related_media(uuid, index, filter_dead):
     """
     Given a UUID, find related search results.
     """
+    logger = parent_logger.getChild("related_media")
     search_client = Search(using="default", index=index)

     # Convert UUID to sequential ID.
@@ -33,8 +40,20 @@ def related_media(uuid, index, filter_dead):
     start, end = get_query_slice(s, page_size, page, filter_dead)
     s = s[start:end]
     response = s.execute()
+    logger.debug(
+        "executed related query "
+        f"es_took_ms={response.took} "
+        f"query={json.dumps(s.to_dict())} "
+        f"response={json.dumps(response.to_dict())} "
+    )

     results = post_process_results(s, start, end, page_size, response, filter_dead)

     result_count, _ = get_result_and_page_count(response, results, page_size)
+
+    logger.debug(
+        "finished post processing and returning related "
+        f"result_count={result_count} "
+        f"results={json.dumps(results)}"
+    )
     return results, result_count
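
For context, a standalone sketch of the `parent_logger.getChild` pattern this commit applies across the API (the module name and message below are illustrative). Child loggers inherit the parent's handlers and level while extending its dotted name, so every record identifies the exact function that emitted it; the `json.dumps` calls in the diff keep the query and response on a single parseable line, in line with the commit's note about log serialization.

```python
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s %(name)s %(message)s",
)

# Module-level parent, as in related.py; the dotted name here is illustrative.
parent_logger = logging.getLogger("catalog.api.controllers.elasticsearch.related")


def related_media():
    # Child loggers need no handlers of their own; records propagate up
    # and carry the full "<module>.related_media" logger name.
    logger = parent_logger.getChild("related_media")
    logger.debug("executed related query es_took_ms=42")


related_media()
# DEBUG catalog.api.controllers.elasticsearch.related.related_media executed related query es_took_ms=42
```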
api/catalog/api/controllers/elasticsearch/search.py (65 additions, 14 deletions)
@@ -1,9 +1,8 @@
 from __future__ import annotations

 import json
-import logging as log
-import pprint
-from typing import List, Literal, Tuple
+import logging
+from typing import List, Literal, Optional, Tuple

 from django.conf import settings

@@ -21,19 +20,32 @@
 from catalog.api.serializers.media_serializers import MediaSearchRequestSerializer


+parent_logger = logging.getLogger(__name__)
+
+
 class FieldMapping:
     """
     Establishes a mapping between a field in ``MediaSearchRequestSerializer`` and the
     Elasticsearch index for a media.
     """

-    def __init__(self, serializer_field: str, es_field: str = None):
+    def __init__(self, serializer_field: str, es_field: Optional[str] = None):
         self.serializer_field: str = serializer_field
         """the name of the field in ``MediaSearchRequestSerializer``"""

         self.es_field: str = es_field or serializer_field
         """the name of the field in the Elasticsearch index"""

+    def __str__(self):
+        return (
+            "FieldMapping("
+            f"serializer_field={self.serializer_field}, "
+            f"es_field={self.es_field})"
+        )
+
+    def __repr__(self):
+        return str(self)
+

 def _quote_escape(query_string: str) -> str:
     """
@@ -69,13 +81,19 @@ def _apply_filter(
     :param behaviour: whether to accept (``filter``) or reject (``exclude``) the hit
     :return: the modified search query
     """

+    logger = parent_logger.getChild("_apply_filter")
     search_params = query_serializer.data
     if mapping.serializer_field in search_params:
         filters = []
         for arg in search_params[mapping.serializer_field].split(","):
             filters.append(Q("term", **{mapping.es_field: arg}))
         method = getattr(s, behaviour)  # can be ``s.filter`` or ``s.exclude``
+        logger.debug(
+            "applying filter "
+            f"behaviour={behaviour} "
+            f"mapping={mapping} "
+            f"filters={json.dumps(list(map(lambda f: f.to_dict(), filters)))} "
+        )
         return method("bool", should=filters)
     else:
         return s
@@ -96,6 +114,13 @@ def perform_search(
     :return: the list of search results with the page and result count
     """

+    logger = parent_logger.getChild("perform_search")
+    logger.info(
+        "searching with "
+        f"query_serializer.data={json.dumps(query_serializer.data)} "
+        f"index={index} "
+        f"(hashed) ip={ip} "
+    )
     s = Search(using="default", index=index)
     search_params = query_serializer.data

@@ -121,6 +146,7 @@

     # Exclude mature content
     if not search_params["mature"]:
+        logger.debug("excluding mature")
         s = s.exclude("term", mature=True)
     # Exclude sources with ``filter_content`` enabled
     s = exclude_filtered_providers(s)
Expand All @@ -130,15 +156,16 @@ def perform_search(

search_fields = ["tags.name", "title", "description"]
if "q" in search_params:
query = _quote_escape(search_params["q"])
escaped_query = _quote_escape(search_params["q"])
logger.info(f"searching with query term escaped_query={escaped_query}")
s = s.query(
"simple_query_string",
query=query,
query=escaped_query,
fields=search_fields,
default_operator="AND",
)
# Boost exact matches
quotes_stripped = query.replace('"', "")
quotes_stripped = escaped_query.replace('"', "")
exact_match_boost = Q(
"simple_query_string",
fields=["title"],
@@ -147,6 +174,7 @@
         )
         s.query = Q("bool", must=s.query, should=exact_match_boost)
     else:
+        logger.info("searching without query term")
         query_bases = ["creator", "title", ("tags", "tags.name")]
         for query_basis in query_bases:
             if isinstance(query_basis, tuple):
@@ -155,12 +183,21 @@
             serializer_field = es_field = query_basis
         if serializer_field in search_params:
             value = _quote_escape(search_params[serializer_field])
+            logger.debug(
+                "adding query for "
+                f"value={value} "
+                f"es_field={es_field} "
+                f"serializer_field={serializer_field}"
+            )
             s = s.query("simple_query_string", fields=[es_field], query=value)

     if settings.USE_RANK_FEATURES:
         feature_boost = {"standardized_popularity": 10000}
         rank_queries = []
         for field, boost in feature_boost.items():
+            logger.debug(
+                "applying ranked features " f"field={field} " f"boost={boost} "
+            )
             rank_queries.append(Q("rank_feature", field=field, boost=boost))
         s.query = Q("bool", must=s.query or EMPTY_QUERY, should=rank_queries)
@@ -183,15 +220,16 @@
     s = s[start:end]

     try:
-        if settings.VERBOSE_ES_RESPONSE:
-            log.info(pprint.pprint(s.to_dict()))
+        logger.info("executing query")
         search_response = s.execute()
-        log.info(
-            f"query={json.dumps(s.to_dict())}," f" es_took_ms={search_response.took}"
+        logger.debug(
+            "executed query "
+            f"es_took_ms={search_response.took} "
+            f"query={json.dumps(s.to_dict())} "
+            f"response={json.dumps(search_response.to_dict())} "
         )
-        if settings.VERBOSE_ES_RESPONSE:
-            log.info(pprint.pprint(search_response.to_dict()))
     except RequestError as e:
+        logger.error("encountered error executing query", exc_info=True)
         raise ValueError(e)

     results = post_process_results(
@@ -206,4 +244,17 @@
     result_count, page_count = get_result_and_page_count(
         search_response, results, search_params["page_size"]
     )
+
+    dumpable_results = (
+        results.to_dict()
+        if isinstance(results, Hit)
+        else list(map(lambda r: r.to_dict(), results))
+    )
+
+    logger.debug(
+        "finished post processing and returning "
+        f"result_count={result_count} "
+        f"page_count={page_count} "
+        f"results={json.dumps(dumpable_results)}"
+    )
     return results, page_count, result_count
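
The new `__str__`/`__repr__` pair on `FieldMapping` appears to exist mainly so the `mapping={mapping}` interpolation in `_apply_filter`'s debug line renders something readable instead of the default `<FieldMapping object at 0x...>`. A trimmed, self-contained illustration (the field names are made up):

```python
class FieldMapping:
    def __init__(self, serializer_field, es_field=None):
        self.serializer_field = serializer_field
        self.es_field = es_field or serializer_field

    def __str__(self):
        return (
            "FieldMapping("
            f"serializer_field={self.serializer_field}, "
            f"es_field={self.es_field})"
        )

    def __repr__(self):
        return str(self)


mapping = FieldMapping("license_type", "license")
print(f"mapping={mapping}")
# mapping=FieldMapping(serializer_field=license_type, es_field=license)
```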
api/catalog/api/controllers/elasticsearch/stats.py (16 additions, 3 deletions)
@@ -1,4 +1,5 @@
-import logging as log
+import json
+import logging
 from typing import Literal

 from django.core.cache import cache
@@ -7,6 +8,9 @@
 from elasticsearch_dsl import Search


+parent_logger = logging.getLogger(__name__)
+
+
 SOURCE_CACHE_TIMEOUT = 60 * 20  # seconds


@@ -18,14 +22,21 @@ def get_stats(index: Literal["image", "audio"]):
     :param index: the Elasticsearch index name
     :return: a dictionary mapping sources to the count of their media items
     """

+    logger = parent_logger.getChild("get_stats")
     source_cache_name = "sources-" + index
     try:
+        logger.debug(f"fetching source cache key={source_cache_name}")
         sources = cache.get(key=source_cache_name)
         if sources is not None:
+            logger.debug(f"cache hit! returning sources={json.dumps(sources)}")
             return sources
+        else:
+            logger.debug("cache missed")
     except ValueError:
-        log.warning("Source cache fetch failed")
+        # TODO: Improve error handling here.
+        # What failed? Why? Do we need to address it?
+        # Is this a critical issue? Why is this a "warning"?
+        logger.warning("Source cache fetch failed")

     # Don't increase `size` without reading this issue first:
     # https://github.com/elastic/elasticsearch/issues/18838
@@ -46,6 +57,8 @@
         sources = {}

     if sources:
+        logger.info(f"putting sources to cache key={source_cache_name}")
         cache.set(key=source_cache_name, timeout=SOURCE_CACHE_TIMEOUT, value=sources)

+    logger.debug(f"sources={json.dumps(sources)}")
     return sources
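
`get_stats` follows a conventional cache-aside flow. A condensed sketch of just that flow, with a hypothetical `compute_sources` helper standing in for the real Elasticsearch terms aggregation:

```python
from django.core.cache import cache

SOURCE_CACHE_TIMEOUT = 60 * 20  # seconds


def compute_sources(index: str) -> dict:
    # Stand-in for the aggregation query in stats.py; illustrative data only.
    return {"flickr": 1_000_000, "met": 250_000}


def get_cached_sources(index: str) -> dict:
    key = f"sources-{index}"
    sources = cache.get(key=key)
    if sources is not None:
        return sources  # cache hit

    sources = compute_sources(index)
    if sources:
        # Cache only non-empty results, mirroring the `if sources:` guard in
        # the diff, so a failed aggregation is retried on the next request
        # instead of pinning an empty mapping for the full 20-minute timeout.
        cache.set(key=key, timeout=SOURCE_CACHE_TIMEOUT, value=sources)
    return sources
```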
