This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add logging and fix all logging contexts #790

Merged
merged 6 commits on Jul 7, 2022
19 changes: 19 additions & 0 deletions api/catalog/api/controllers/elasticsearch/related.py
@@ -1,3 +1,6 @@
import json
import logging

from elasticsearch_dsl import Search

from catalog.api.controllers.elasticsearch.utils import (
@@ -8,10 +11,14 @@
)


parent_logger = logging.getLogger(__name__)


def related_media(uuid, index, filter_dead):
"""
Given a UUID, find related search results.
"""
logger = parent_logger.getChild("related_media")
search_client = Search(using="default", index=index)

# Convert UUID to sequential ID.
@@ -33,8 +40,20 @@ def related_media(uuid, index, filter_dead):
start, end = get_query_slice(s, page_size, page, filter_dead)
s = s[start:end]
response = s.execute()
logger.debug(
"executed related query "
f"es_took_ms={response.took} "
f"query={json.dumps(s.to_dict())} "
f"response={json.dumps(response.to_dict())} "
)

results = post_process_results(s, start, end, page_size, response, filter_dead)

result_count, _ = get_result_and_page_count(response, results, page_size)

logger.debug(
"finished post processing and returning related "
f"result_count={result_count} "
f"results={json.dumps(results)}"
)
return results, result_count
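The getChild() calls above hang each function's log records off a single module-level parent logger, producing hierarchical names such as catalog.api.controllers.elasticsearch.related.related_media that can be filtered or silenced individually. Below is a minimal, runnable sketch of the same pattern; the function name and logged values are illustrative, not taken from this PR.

import logging

parent_logger = logging.getLogger(__name__)


def related_media_sketch():
    # Child logger is named "<module>.related_media_sketch".
    logger = parent_logger.getChild("related_media_sketch")
    took_ms, count = 12, 3  # stand-in values for illustration
    # The PR's convention: a static message followed by key=value pairs.
    logger.debug("executed related query " f"es_took_ms={took_ms} " f"result_count={count}")


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    related_media_sketch()

Running this prints the fully qualified logger name with each record (here __main__.related_media_sketch), which is what makes per-function filtering possible.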
79 changes: 65 additions & 14 deletions api/catalog/api/controllers/elasticsearch/search.py
@@ -1,9 +1,8 @@
from __future__ import annotations

import json
-import logging as log
-import pprint
-from typing import List, Literal, Tuple
import logging
from typing import List, Literal, Optional, Tuple

from django.conf import settings

Expand All @@ -20,19 +19,32 @@
from catalog.api.serializers.media_serializers import MediaSearchRequestSerializer


parent_logger = logging.getLogger(__name__)


class FieldMapping:
"""
Establishes a mapping between a field in ``MediaSearchRequestSerializer`` and the
Elasticsearch index for a media.
"""

-def __init__(self, serializer_field: str, es_field: str = None):
def __init__(self, serializer_field: str, es_field: Optional[str] = None):
self.serializer_field: str = serializer_field
"""the name of the field in ``MediaSearchRequestSerializer``"""

self.es_field: str = es_field or serializer_field
"""the name of the field in the Elasticsearch index"""

def __str__(self):
return (
"FieldMapping("
f"serializer_field={self.serializer_field}, "
f"es_field={self.es_field})"
)

def __repr__(self):
return str(self)


def _quote_escape(query_string: str) -> str:
"""
@@ -68,13 +80,19 @@ def _apply_filter(
:param behaviour: whether to accept (``filter``) or reject (``exclude``) the hit
:return: the modified search query
"""

logger = parent_logger.getChild("_apply_filter")
search_params = query_serializer.data
if mapping.serializer_field in search_params:
filters = []
for arg in search_params[mapping.serializer_field].split(","):
filters.append(Q("term", **{mapping.es_field: arg}))
method = getattr(s, behaviour) # can be ``s.filter`` or ``s.exclude``
logger.debug(
"applying filter "
f"behaviour={behaviour} "
f"mapping={mapping} "
f"filters={json.dumps(list(map(lambda f: f.to_dict(), filters)))} "
)
return method("bool", should=filters)
else:
return s
@@ -95,6 +113,13 @@ def perform_search(
:return: the list of search results with the page and result count
"""

logger = parent_logger.getChild("perform_search")
logger.info(
"searching with "
f"query_serializer.data={json.dumps(query_serializer.data)} "
f"index={index} "
f"(hashed) ip={ip} "
)
s = Search(using="default", index=index)
search_params = query_serializer.data

@@ -120,6 +145,7 @@

# Exclude mature content
if not search_params["mature"]:
logger.debug("excluding mature")
s = s.exclude("term", mature=True)
# Exclude sources with ``filter_content`` enabled
s = exclude_filtered_providers(s)
@@ -129,15 +155,16 @@

search_fields = ["tags.name", "title", "description"]
if "q" in search_params:
-query = _quote_escape(search_params["q"])
escaped_query = _quote_escape(search_params["q"])
logger.info(f"searching with query term escaped_query={escaped_query}")
s = s.query(
"simple_query_string",
-query=query,
query=escaped_query,
fields=search_fields,
default_operator="AND",
)
# Boost exact matches
-quotes_stripped = query.replace('"', "")
quotes_stripped = escaped_query.replace('"', "")
exact_match_boost = Q(
"simple_query_string",
fields=["title"],
@@ -146,6 +173,7 @@
)
s.query = Q("bool", must=s.query, should=exact_match_boost)
else:
logger.info("searching without query term")
query_bases = ["creator", "title", ("tags", "tags.name")]
for query_basis in query_bases:
if isinstance(query_basis, tuple):
@@ -154,12 +182,21 @@
serializer_field = es_field = query_basis
if serializer_field in search_params:
value = _quote_escape(search_params[serializer_field])
logger.debug(
"adding query for "
f"value={value} "
f"es_field={es_field}"
f"serializer_field={serializer_field}"
)
s = s.query("simple_query_string", fields=[es_field], query=value)

if settings.USE_RANK_FEATURES:
feature_boost = {"standardized_popularity": 10000}
rank_queries = []
for field, boost in feature_boost.items():
logger.debug(
"applying ranked features " f"field={field} " f"boost={boost} "
)
rank_queries.append(Q("rank_feature", field=field, boost=boost))
s.query = Q("bool", must=s.query, should=rank_queries)

@@ -182,15 +219,16 @@
s = s[start:end]

try:
-if settings.VERBOSE_ES_RESPONSE:
-log.info(pprint.pprint(s.to_dict()))
logger.info("executing query")
search_response = s.execute()
-log.info(
-f"query={json.dumps(s.to_dict())}," f" es_took_ms={search_response.took}"
logger.debug(
"executed query "
f"es_took_ms={search_response.took} "
f"query={json.dumps(s.to_dict())} "
f"response={json.dumps(search_response.to_dict())} "
)
-if settings.VERBOSE_ES_RESPONSE:
-log.info(pprint.pprint(search_response.to_dict()))
except RequestError as e:
logger.error("encountered error executing query", exc_info=True)
raise ValueError(e)

results = post_process_results(
@@ -205,4 +243,17 @@
result_count, page_count = get_result_and_page_count(
search_response, results, search_params["page_size"]
)

dumpable_results = (
results.to_dict()
if isinstance(results, Hit)
else list(map(lambda r: r.to_dict(), results))
)

logger.debug(
"finished post processing and returning "
f"result_count={result_count} "
f"page_count={page_count} "
f"results={json.dumps(dumpable_results)}"
)
return results, page_count, result_count
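With the VERBOSE_ES_RESPONSE setting gone, the query/response dumps above are ordinary DEBUG-level records, so whether they appear is now purely a logging-configuration decision. Below is a sketch of a Django LOGGING entry that would surface them for these controllers only; the dotted logger name is inferred from the file paths in this PR and should be treated as an assumption.

# settings.py sketch: enable DEBUG output for the search controllers only.
LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {
        "console": {"class": "logging.StreamHandler"},
    },
    "loggers": {
        # Assumed dotted path, derived from
        # api/catalog/api/controllers/elasticsearch/. Every getChild()
        # logger beneath it inherits the DEBUG level.
        "catalog.api.controllers.elasticsearch": {
            "handlers": ["console"],
            "level": "DEBUG",
        },
    },
}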
19 changes: 16 additions & 3 deletions api/catalog/api/controllers/elasticsearch/stats.py
@@ -1,4 +1,5 @@
-import logging as log
import json
import logging
from typing import Literal

from django.core.cache import cache
@@ -7,6 +8,9 @@
from elasticsearch_dsl import Search


parent_logger = logging.getLogger(__name__)


SOURCE_CACHE_TIMEOUT = 60 * 20 # seconds


@@ -18,14 +22,21 @@ def get_stats(index: Literal["image", "audio"]):
:param index: the Elasticsearch index name
:return: a dictionary mapping sources to the count of their media items
"""

logger = parent_logger.getChild("get_stats")
source_cache_name = "sources-" + index
try:
logger.debug(f"fetching source cache key={source_cache_name}")
sources = cache.get(key=source_cache_name)
if sources is not None:
logger.debug(f"cache hit! returning sources={json.dumps(sources)}")
return sources
else:
logger.debug("cache missed")
except ValueError:
-log.warning("Source cache fetch failed")
# TODO: Improve error handling here.
# What failed? Why? Do we need to address it?
# Is this a critical issue? Why is this a "warning"?
logger.warning("Source cache fetch failed")

# Don't increase `size` without reading this issue first:
# https://github.com/elastic/elasticsearch/issues/18838
@@ -46,6 +57,8 @@
sources = {}

if sources:
logger.info(f"putting sources to cache key={source_cache_name}")
cache.set(key=source_cache_name, timeout=SOURCE_CACHE_TIMEOUT, value=sources)

logger.debug(f"sources={json.dumps(sources)}")
return sources
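get_stats follows a cache-aside flow: read the cache, fall back to Elasticsearch on a miss, then repopulate with a TTL, and the new log lines mark each branch. Stripped of the aggregation details, the shape is sketched below; compute_sources is a hypothetical placeholder for the ES aggregation, and the Django cache calls mirror the ones in the diff.

from django.core.cache import cache

SOURCE_CACHE_TIMEOUT = 60 * 20  # seconds


def get_sources_cached(index: str) -> dict:
    key = f"sources-{index}"
    try:
        sources = cache.get(key=key)
        if sources is not None:
            return sources  # cache hit
    except ValueError:
        pass  # fetch failed; fall through and recompute, as get_stats does
    sources = compute_sources(index)  # hypothetical: runs the ES aggregation
    if sources:
        # Only non-empty results are cached, mirroring get_stats.
        cache.set(key=key, timeout=SOURCE_CACHE_TIMEOUT, value=sources)
    return sources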