diff --git a/api/catalog/api/controllers/elasticsearch/related.py b/api/catalog/api/controllers/elasticsearch/related.py index df28513b6..e3b0decc6 100644 --- a/api/catalog/api/controllers/elasticsearch/related.py +++ b/api/catalog/api/controllers/elasticsearch/related.py @@ -1,3 +1,6 @@ +import json +import logging + from elasticsearch_dsl import Search from catalog.api.controllers.elasticsearch.utils import ( @@ -8,10 +11,14 @@ ) +parent_logger = logging.getLogger(__name__) + + def related_media(uuid, index, filter_dead): """ Given a UUID, find related search results. """ + logger = parent_logger.getChild("related_media") search_client = Search(using="default", index=index) # Convert UUID to sequential ID. @@ -33,8 +40,20 @@ def related_media(uuid, index, filter_dead): start, end = get_query_slice(s, page_size, page, filter_dead) s = s[start:end] response = s.execute() + logger.debug( + "executed related query " + f"es_took_ms={response.took} " + f"query={json.dumps(s.to_dict())} " + f"response={json.dumps(response.to_dict())} " + ) + results = post_process_results(s, start, end, page_size, response, filter_dead) result_count, _ = get_result_and_page_count(response, results, page_size) + logger.debug( + "finished post processing and returning related " + f"result_count={result_count} " + f"results={json.dumps(results)}" + ) return results, result_count diff --git a/api/catalog/api/controllers/elasticsearch/search.py b/api/catalog/api/controllers/elasticsearch/search.py index a8493b327..c0b1d9b86 100644 --- a/api/catalog/api/controllers/elasticsearch/search.py +++ b/api/catalog/api/controllers/elasticsearch/search.py @@ -1,9 +1,8 @@ from __future__ import annotations import json -import logging as log -import pprint -from typing import List, Literal, Tuple +import logging +from typing import List, Literal, Optional, Tuple from django.conf import settings @@ -20,19 +19,32 @@ from catalog.api.serializers.media_serializers import MediaSearchRequestSerializer +parent_logger = logging.getLogger(__name__) + + class FieldMapping: """ Establishes a mapping between a field in ``MediaSearchRequestSerializer`` and the Elasticsearch index for a media. """ - def __init__(self, serializer_field: str, es_field: str = None): + def __init__(self, serializer_field: str, es_field: Optional[str] = None): self.serializer_field: str = serializer_field """the name of the field in ``MediaSearchRequestSerializer``""" self.es_field: str = es_field or serializer_field """the name of the field in the Elasticsearch index""" + def __str__(self): + return ( + "FieldMapping(" + f"serializer_field={self.serializer_field}, " + f"es_field={self.es_field})" + ) + + def __repr__(self): + return str(self) + def _quote_escape(query_string: str) -> str: """ @@ -68,13 +80,19 @@ def _apply_filter( :param behaviour: whether to accept (``filter``) or reject (``exclude``) the hit :return: the modified search query """ - + logger = parent_logger.getChild("_apply_filter") search_params = query_serializer.data if mapping.serializer_field in search_params: filters = [] for arg in search_params[mapping.serializer_field].split(","): filters.append(Q("term", **{mapping.es_field: arg})) method = getattr(s, behaviour) # can be ``s.filter`` or ``s.exclude`` + logger.debug( + "applying filter " + f"behaviour={behaviour} " + f"mapping={mapping} " + f"filters={json.dumps(list(map(lambda f: f.to_dict(), filters)))} " + ) return method("bool", should=filters) else: return s @@ -95,6 +113,13 @@ def perform_search( :return: the list of search results with the page and result count """ + logger = parent_logger.getChild("perform_search") + logger.info( + "searching with " + f"query_serializer.data={json.dumps(query_serializer.data)} " + f"index={index} " + f"(hashed) ip={ip} " + ) s = Search(using="default", index=index) search_params = query_serializer.data @@ -120,6 +145,7 @@ def perform_search( # Exclude mature content if not search_params["mature"]: + logger.debug("excluding mature") s = s.exclude("term", mature=True) # Exclude sources with ``filter_content`` enabled s = exclude_filtered_providers(s) @@ -129,15 +155,16 @@ def perform_search( search_fields = ["tags.name", "title", "description"] if "q" in search_params: - query = _quote_escape(search_params["q"]) + escaped_query = _quote_escape(search_params["q"]) + logger.info(f"searching with query term escaped_query={escaped_query}") s = s.query( "simple_query_string", - query=query, + query=escaped_query, fields=search_fields, default_operator="AND", ) # Boost exact matches - quotes_stripped = query.replace('"', "") + quotes_stripped = escaped_query.replace('"', "") exact_match_boost = Q( "simple_query_string", fields=["title"], @@ -146,6 +173,7 @@ def perform_search( ) s.query = Q("bool", must=s.query, should=exact_match_boost) else: + logger.info("searching without query term") query_bases = ["creator", "title", ("tags", "tags.name")] for query_basis in query_bases: if isinstance(query_basis, tuple): @@ -154,12 +182,21 @@ def perform_search( serializer_field = es_field = query_basis if serializer_field in search_params: value = _quote_escape(search_params[serializer_field]) + logger.debug( + "adding query for " + f"value={value} " + f"es_field={es_field}" + f"serializer_field={serializer_field}" + ) s = s.query("simple_query_string", fields=[es_field], query=value) if settings.USE_RANK_FEATURES: feature_boost = {"standardized_popularity": 10000} rank_queries = [] for field, boost in feature_boost.items(): + logger.debug( + "applying ranked features " f"field={field} " f"boost={boost} " + ) rank_queries.append(Q("rank_feature", field=field, boost=boost)) s.query = Q("bool", must=s.query, should=rank_queries) @@ -182,15 +219,16 @@ def perform_search( s = s[start:end] try: - if settings.VERBOSE_ES_RESPONSE: - log.info(pprint.pprint(s.to_dict())) + logger.info("executing query") search_response = s.execute() - log.info( - f"query={json.dumps(s.to_dict())}," f" es_took_ms={search_response.took}" + logger.debug( + "executed query " + f"es_took_ms={search_response.took} " + f"query={json.dumps(s.to_dict())} " + f"response={json.dumps(search_response.to_dict())} " ) - if settings.VERBOSE_ES_RESPONSE: - log.info(pprint.pprint(search_response.to_dict())) except RequestError as e: + logger.error("encountered error executing query", exc_info=True) raise ValueError(e) results = post_process_results( @@ -205,4 +243,17 @@ def perform_search( result_count, page_count = get_result_and_page_count( search_response, results, search_params["page_size"] ) + + dumpable_results = ( + results.to_dict() + if isinstance(results, Hit) + else list(map(lambda r: r.to_dict(), results)) + ) + + logger.debug( + "finished post processing and returning " + f"result_count={result_count} " + f"page_count={page_count} " + f"results={json.dumps(dumpable_results)}" + ) return results, page_count, result_count diff --git a/api/catalog/api/controllers/elasticsearch/stats.py b/api/catalog/api/controllers/elasticsearch/stats.py index f37e54812..bd7a163bc 100644 --- a/api/catalog/api/controllers/elasticsearch/stats.py +++ b/api/catalog/api/controllers/elasticsearch/stats.py @@ -1,4 +1,5 @@ -import logging as log +import json +import logging from typing import Literal from django.core.cache import cache @@ -7,6 +8,9 @@ from elasticsearch_dsl import Search +parent_logger = logging.getLogger(__name__) + + SOURCE_CACHE_TIMEOUT = 60 * 20 # seconds @@ -18,14 +22,21 @@ def get_stats(index: Literal["image", "audio"]): :param index: the Elasticsearch index name :return: a dictionary mapping sources to the count of their media items """ - + logger = parent_logger.getChild("get_stats") source_cache_name = "sources-" + index try: + logger.debug(f"fetching source cache key={source_cache_name}") sources = cache.get(key=source_cache_name) if sources is not None: + logger.debug(f"cache hit! returning sources={json.dumps(sources)}") return sources + else: + logger.debug("cache missed") except ValueError: - log.warning("Source cache fetch failed") + # TODO: Improve error handling here. + # What failed? Why? Do we need to address it? + # Is this a critical issue? Why is this a "warning"? + logger.warning("Source cache fetch failed") # Don't increase `size` without reading this issue first: # https://github.com/elastic/elasticsearch/issues/18838 @@ -46,6 +57,8 @@ def get_stats(index: Literal["image", "audio"]): sources = {} if sources: + logger.info(f"putting sources to cache key={source_cache_name}") cache.set(key=source_cache_name, timeout=SOURCE_CACHE_TIMEOUT, value=sources) + logger.debug(f"sources={json.dumps(sources)}") return sources diff --git a/api/catalog/api/controllers/elasticsearch/utils.py b/api/catalog/api/controllers/elasticsearch/utils.py index cb0bb6e12..5ba55f111 100644 --- a/api/catalog/api/controllers/elasticsearch/utils.py +++ b/api/catalog/api/controllers/elasticsearch/utils.py @@ -1,3 +1,5 @@ +import json +import logging from itertools import accumulate from math import ceil from typing import List, Optional, Tuple @@ -12,6 +14,9 @@ from catalog.api.utils.validate_images import validate_images +parent_logger = logging.getLogger(__name__) + + FILTER_CACHE_TIMEOUT = 30 DEAD_LINK_RATIO = 1 / 2 ELASTICSEARCH_MAX_RESULT_WINDOW = 10000 @@ -25,20 +30,24 @@ def exclude_filtered_providers(s: Search) -> Search: :param s: the search query to issue to Elasticsearch :return: the modified search query """ - + logger = parent_logger.getChild("exclude_filtered_providers") filter_cache_key = "filtered_providers" filtered_providers = cache.get(key=filter_cache_key) if filtered_providers is None: filtered_providers = ContentProvider.objects.filter(filter_content=True).values( "provider_identifier" ) + logger.debug("adding filtered providers to cache") cache.set( key=filter_cache_key, timeout=FILTER_CACHE_TIMEOUT, value=filtered_providers, ) + + logger.info(f'filtered_providers={",".join(filtered_providers)}') if len(filtered_providers) != 0: to_exclude = [f["provider_identifier"] for f in filtered_providers] + logger.info("auto-excluding filtered providers") s = s.exclude("terms", provider=to_exclude) return s @@ -55,26 +64,71 @@ def paginate_with_dead_link_mask( :param page: The page number. :return: Tuple of start and end. """ + logger = parent_logger.getChild("paginate_with_dead_link_mask") query_hash = get_query_hash(s) query_mask = get_query_mask(query_hash) + logger.debug( + f"page_size={page_size} " + f"page={page} " + f"query_hash={query_hash} " + f'query_mask={",".join(map(str, query_mask))} ' + ) if not query_mask: start = 0 end = ceil(page_size * page / (1 - DEAD_LINK_RATIO)) - elif page_size * (page - 1) > sum(query_mask): + logger.debug( + "empty query_mask " + f"page_size={page_size} " + f"page={page} " + f"start={start} " + f"end={end} " + ) + return start, end + + # TODO: What does this request_size mean vs the one below + # that doesn't subtract 1 from the page? Why do we subtract + # 1 here and the other time we do not? + request_size = page_size * (page - 1) + query_mask_sum = sum(query_mask) + if request_size > query_mask_sum: start = len(query_mask) end = ceil(page_size * page / (1 - DEAD_LINK_RATIO)) + logger.debug( + "request size exceeds query mask sum (branch 1)" + f"request_size={request_size} " + f"sum(query_mask)={query_mask_sum} " + f"start={start} " + f"end={end} " + ) + return start, end + + accu_query_mask = list(accumulate(query_mask)) + logger.debug(f"accu_query_mask={accu_query_mask} ") + start = 0 + if page > 1: + caught = False + try: + idx = page_size * (page - 1) + 1 + start = accu_query_mask.index(idx) + except ValueError: + caught = True + idx = page_size * (page - 1) + start = accu_query_mask.index(idx) + 1 + + logger.debug("shifted page using " f"idx={idx} " f"caught?={caught} ") + + request_size = page_size * page + if request_size > query_mask_sum: + logger.debug( + "request size exceeds query mask sum (branch 2) " + f"request_size={request_size} " + f"query_mask_sum={query_mask_sum} " + ) + end = ceil(page_size * page / (1 - DEAD_LINK_RATIO)) else: - accu_query_mask = list(accumulate(query_mask)) - start = 0 - if page > 1: - try: - start = accu_query_mask.index(page_size * (page - 1) + 1) - except ValueError: - start = accu_query_mask.index(page_size * (page - 1)) + 1 - if page_size * page > sum(query_mask): - end = ceil(page_size * page / (1 - DEAD_LINK_RATIO)) - else: - end = accu_query_mask.index(page_size * page) + 1 + end = accu_query_mask.index(page_size * page) + 1 + + logger.debug("third branch returnining " f"start={start} " f"end={end} ") return start, end @@ -84,24 +138,37 @@ def get_query_slice( """ Select the start and end of the search results for this query. """ + logger = parent_logger.getChild("get_query_slice") if filter_dead: start_slice, end_slice = paginate_with_dead_link_mask(s, page_size, page) else: # Paginate search query. start_slice = page_size * (page - 1) end_slice = page_size * page + logger.debug( + "paginating without filtering for dead links " + f"page_size={page_size} " + f"page={page} " + f"start_slice={start_slice} " + f"end_slice={end_slice} " + ) if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW: - raise ValueError("Deep pagination is not allowed.") + raise ValueError( + "Deep pagination is not allowed. " + f"Window totalled {start_slice + end_slice}." + ) + + logger.info(f"start_slice={start_slice} end_slice={end_slice}") return start_slice, end_slice def post_process_results( - s, start, end, page_size, search_results, filter_dead + s, start, end, page_size, search_results, filter_dead, depth: int = 0 ) -> List[Hit]: """ After fetching the search results from the back end, iterate through the - results, perform image validation, and route certain thumbnails through our - proxy. + results filtering dead links. After filtering links, continue accruing additional + results until the requested page_size of validated results is reached. :param s: The Elasticsearch Search object. :param start: The start of the result slice. @@ -111,6 +178,8 @@ def post_process_results( :param filter_dead: Whether images should be validated. :return: List of results. """ + logger = parent_logger.getChild("post_process_results") + logger.info(f"post processing results depth={depth}") results = [] to_validate = [] for res in search_results: @@ -129,10 +198,20 @@ def post_process_results( return results s = s[start:end] + logger.info(f"executing additional backfill query depth={depth}") search_response = s.execute() + # don't log the query off of ``s`` because it's identical to the already + # logged query elsewhere in the logs, just with different start/end. + logger.debug( + "exectued additional backfill query" + f"start={start} " + f"end={end} " + f"es_took_ms={search_response.took} " + f"response={json.dumps(search_response.to_dict())} " + ) return post_process_results( - s, start, end, page_size, search_response, filter_dead + s, start, end, page_size, search_response, filter_dead, depth=depth + 1 ) return results[:page_size] @@ -148,13 +227,22 @@ def get_result_and_page_count( :param results: The list of filtered result Hits. :return: Result and page count. """ + logger = parent_logger.getChild("get_result_and_page_count") result_count = response_obj.hits.total.value natural_page_count = int(result_count / page_size) if natural_page_count % page_size != 0: + logger.debug("incrementing natural_page_count") natural_page_count += 1 last_allowed_page = int((5000 + page_size / 2) / page_size) page_count = min(natural_page_count, last_allowed_page) if len(results) < page_size and page_count == 0: + logger.debug(f"setting result_count to len(results)={len(results)}") result_count = len(results) + logger.debug( + f"result_count={result_count} " + f"page_count={page_count} " + f"natural_page_count={natural_page_count} " + f"last_allowed_page={last_allowed_page}" + ) return result_count, page_count diff --git a/api/catalog/api/utils/oauth2_helper.py b/api/catalog/api/utils/oauth2_helper.py index ccfa0d0f1..e7673e771 100644 --- a/api/catalog/api/utils/oauth2_helper.py +++ b/api/catalog/api/utils/oauth2_helper.py @@ -6,7 +6,7 @@ from catalog.api import models -log = logging.getLogger(__name__) +parent_logger = logging.getLogger(__name__) def get_token_info(token: str): @@ -19,6 +19,7 @@ def get_token_info(token: str): token, rate limit model, and email verification status as a tuple; else return (None, None, None). """ + logger = parent_logger.getChild("get_token_info") try: token = AccessToken.objects.get(token=token) except AccessToken.DoesNotExist: @@ -30,11 +31,11 @@ def get_token_info(token: str): rate_limit_model = application.rate_limit_model verified = application.verified except models.ThrottledApplication.DoesNotExist: - log.warning("Failed to find application associated with access token.") + logger.warning("Failed to find application associated with access token.") client_id = None rate_limit_model = None verified = None return client_id, rate_limit_model, verified else: - log.warning("Rejected expired access token.") + logger.warning("Rejected expired access token.") return None, None, None diff --git a/api/catalog/api/utils/throttle.py b/api/catalog/api/utils/throttle.py index f6938fae1..43120196a 100644 --- a/api/catalog/api/utils/throttle.py +++ b/api/catalog/api/utils/throttle.py @@ -1,5 +1,4 @@ import abc -import logging from rest_framework.throttling import SimpleRateThrottle @@ -8,9 +7,6 @@ from catalog.api.utils.oauth2_helper import get_token_info -log = logging.getLogger(__name__) - - class ThrottleExemption(abc.ABC): """ Abstract class describing a given throttle exemption. diff --git a/api/catalog/api/utils/validate_images.py b/api/catalog/api/utils/validate_images.py index 3dcd1bb3c..9a94e682c 100644 --- a/api/catalog/api/utils/validate_images.py +++ b/api/catalog/api/utils/validate_images.py @@ -7,7 +7,7 @@ from catalog.api.utils.dead_link_mask import get_query_mask, save_query_mask -log = logging.getLogger(__name__) +parent_logger = logging.getLogger(__name__) def validate_images(query_hash, start_slice, results, image_urls): @@ -19,8 +19,12 @@ def validate_images(query_hash, start_slice, results, image_urls): Results are cached in redis and shared amongst all API servers in the cluster. """ + logger = parent_logger.getChild("validate_images") if not image_urls: + logger.info("no image urls to validate") return + + logger.debug("starting validation") start_time = time.time() # Pull matching images from the cache. redis = django_redis.get_redis_connection("default") @@ -29,11 +33,13 @@ def validate_images(query_hash, start_slice, results, image_urls): cached_statuses = [ int(b.decode("utf-8")) if b is not None else None for b in cached_statuses ] + logger.debug(f"len(cached_statuses)={len(cached_statuses)}") # Anything that isn't in the cache needs to be validated via HEAD request. to_verify = {} for idx, url in enumerate(image_urls): if cached_statuses[idx] is None: to_verify[url] = idx + logger.debug(f"len(to_verify)={len(to_verify)}") reqs = ( grequests.head(u, allow_redirects=False, timeout=2, verify=False) for u in to_verify.keys() @@ -58,11 +64,14 @@ def validate_images(query_hash, start_slice, results, image_urls): for key, status in to_cache.items(): # Cache successful links for a day, and broken links for 120 days. if status == 200: + logger.debug("healthy link " f"key={key} ") pipe.expire(key, twenty_four_hours_seconds) elif status == -1: + logger.debug("no response from provider " f"key={key}") # Content provider failed to respond; try again in a short interval pipe.expire(key, thirty_minutes) else: + logger.debug("broken link " f"key={key} ") pipe.expire(key, twenty_four_hours_seconds * 120) pipe.execute() @@ -81,14 +90,16 @@ def validate_images(query_hash, start_slice, results, image_urls): del_idx = len(cached_statuses) - idx - 1 status = cached_statuses[del_idx] if status == 429 or status == 403: - log.warning( + logger.warning( "Image validation failed due to rate limiting or blocking. " - f"Affected URL: {image_urls[idx]}" + f"url={image_urls[idx]} " + f"status={status} " ) elif status != 200: - log.info( - f"Deleting broken image with ID " - f"{results[del_idx]['identifier']} from results." + logger.info( + "Deleting broken image from results " + f"id={results[del_idx]['identifier']} " + f"status={status} " ) del results[del_idx] new_mask[del_idx] = 0 @@ -100,8 +111,14 @@ def validate_images(query_hash, start_slice, results, image_urls): save_query_mask(query_hash, new_mask) end_time = time.time() - log.info(f"Validated images in {end_time - start_time} ") + logger.debug( + "end validation " + f"end_time={end_time} " + f"start_time={start_time} " + f"delta={end_time - start_time} " + ) def _validation_failure(request, exception): - log.warning(f"Failed to validate image! Reason: {exception}") + logger = parent_logger.getChild("_validation_failure") + logger.warning(f"Failed to validate image! Reason: {exception}") diff --git a/api/catalog/api/utils/waveform.py b/api/catalog/api/utils/waveform.py index 6a5478e7f..605f7da48 100644 --- a/api/catalog/api/utils/waveform.py +++ b/api/catalog/api/utils/waveform.py @@ -11,7 +11,7 @@ import requests -log = logging.getLogger(__name__) +parent_logger = logging.getLogger(__name__) TMP_DIR = pathlib.Path("/tmp").resolve() UA_STRING = "OpenverseWaveform/0.0 (https://wordpress.org/openverse)" @@ -41,18 +41,19 @@ def download_audio(url, identifier): :param identifier: the identifier of the media object to name the file :returns: the name of the file on the disk """ - log.info(f"Downloading file at {url}") + logger = parent_logger.getChild("download_audio") + logger.info(f"downloading file url={url}") headers = {"User-Agent": UA_STRING} with requests.get(url, stream=True, headers=headers) as res: - log.debug(f"Response code: {res.status_code}") + logger.debug(f"res.status_code={res.status_code}") mimetype = res.headers["content-type"] - log.debug(f"MIME type: {mimetype}") + logger.debug(f"mimetype={mimetype}") ext = ext_from_url(url) or mimetypes.guess_extension(mimetype) if ext is None: raise ValueError("Could not identify media extension") file_name = f"audio-{identifier}{ext}" - log.debug(f"File name: {file_name}") + logger.debug(f"file name={file_name}") with open(TMP_DIR.joinpath(file_name), "wb") as file: shutil.copyfileobj(res.raw, file) return file_name @@ -67,7 +68,8 @@ def generate_waveform(file_name, duration): :param file_name: the name of the downloaded audio file :param duration: the duration of the audio to determine pixels per second """ - log.info("Invoking audiowaveform") + logger = parent_logger.getChild("generate_waveform") + logger.info("Invoking audiowaveform") pps = math.ceil(1e6 / duration) # approx 1000 points in total args = [ @@ -79,9 +81,9 @@ def generate_waveform(file_name, duration): "--pixels-per-second", str(pps), ] - log.debug(f'Command: {" ".join(args)}') + logger.debug(f'executing subprocess command={" ".join(args)}') proc = subprocess.run(args, cwd=TMP_DIR, check=True, capture_output=True) - log.debug(f"Subprocess exit code: {proc.returncode}") + logger.debug(f"finished subprocess proc.returncode={proc.returncode}") return proc.stdout @@ -95,11 +97,12 @@ def process_waveform_output(json_out): :param json_out: the JSON output generated by ``audiowaveform`` :returns: the list of peaks """ - log.info("Transforming points") + logger = parent_logger.getChild("process_waveform_output") + logger.info("Transforming points") output = json.loads(json_out) data = output["data"] - log.debug(f"Original umber of points: {len(data)}") + logger.debug(f"initial points len(data)={len(data)}") transformed_data = [] max_val = 0 @@ -112,7 +115,9 @@ def process_waveform_output(json_out): if val > max_val: max_val = val transformed_data = [round(val / max_val, 5) for val in transformed_data] - log.debug(f"Transformed number of points: {len(transformed_data)}") + logger.debug( + f"finished transformation len(transformed_data)={len(transformed_data)}" + ) return transformed_data @@ -122,15 +127,16 @@ def cleanup(file_name): :param file_name: the name of the file to delete """ - log.info(f"Deleting {file_name}") + logger = parent_logger.getChild("cleanup") file_path = TMP_DIR.joinpath(file_name) - log.debug(f"File path: {file_path}") + logger.debug(f"file_path={file_path}") if file_path.exists(): - log.info(f"Deleting file {file_path}") + logger.debug("deleting file") os.remove(file_path) + logger.debug("file deleted") else: - log.info("File not found, nothing deleted") + logger.debug("file not found, nothing deleted") def generate_peaks(audio) -> List[float]: diff --git a/api/catalog/api/views/audio_views.py b/api/catalog/api/views/audio_views.py index 620e6cebd..03e91b045 100644 --- a/api/catalog/api/views/audio_views.py +++ b/api/catalog/api/views/audio_views.py @@ -1,5 +1,3 @@ -import logging - from django.utils.decorators import method_decorator from rest_framework.decorators import action from rest_framework.response import Response @@ -27,9 +25,6 @@ from catalog.api.views.media_views import MediaViewSet -log = logging.getLogger(__name__) - - @method_decorator(swagger_auto_schema(**AudioSearch.swagger_setup), "list") @method_decorator(swagger_auto_schema(**AudioStats.swagger_setup), "stats") @method_decorator(swagger_auto_schema(**AudioDetail.swagger_setup), "retrieve") diff --git a/api/catalog/api/views/image_views.py b/api/catalog/api/views/image_views.py index 6b3f40498..17dff394b 100644 --- a/api/catalog/api/views/image_views.py +++ b/api/catalog/api/views/image_views.py @@ -1,5 +1,4 @@ import io -import logging from django.conf import settings from django.http.response import FileResponse, Http404, HttpResponse @@ -37,9 +36,6 @@ from catalog.api.views.media_views import MediaViewSet -log = logging.getLogger(__name__) - - @method_decorator(swagger_auto_schema(**ImageSearch.swagger_setup), "list") @method_decorator(swagger_auto_schema(**ImageStats.swagger_setup), "stats") @method_decorator(swagger_auto_schema(**ImageDetail.swagger_setup), "retrieve") diff --git a/api/catalog/logger.py b/api/catalog/logger.py index c6651db65..188638ca9 100644 --- a/api/catalog/logger.py +++ b/api/catalog/logger.py @@ -1,11 +1,16 @@ from logging import LogRecord +from decouple import config + def health_check_filter(record: LogRecord) -> bool: # Filter out health checks from the logs, they're verbose and happen frequently return not ("GET /healthcheck" in record.getMessage() and record.status_code == 200) +LOG_LEVEL = config("LOG_LEVEL", default="INFO").upper() + + # Logging configuration LOGGING = { # NOTE: Most of this is inherited from the default configuration @@ -33,7 +38,7 @@ def health_check_filter(record: LogRecord) -> bool: "handlers": { # Default console logger "console": { - "level": "INFO", + "level": LOG_LEVEL, "filters": ["require_debug_true", "request_id"], "class": "logging.StreamHandler", "formatter": "console", @@ -47,14 +52,14 @@ def health_check_filter(record: LogRecord) -> bool: }, # Handler for all other logging "general_console": { - "level": "INFO", + "level": LOG_LEVEL, "filters": ["request_id"], "class": "logging.StreamHandler", "formatter": "console", }, # Default server logger "django.server": { - "level": "INFO", + "level": LOG_LEVEL, "filters": ["request_id"], "class": "logging.StreamHandler", "formatter": "django.server", @@ -69,6 +74,8 @@ def health_check_filter(record: LogRecord) -> bool: "loggers": { "django": { "handlers": ["console", "console_prod", "mail_admins"], + # Keep this at info to avoid django internal debug logs; + # we just want our own debug logs when log level is set to debug "level": "INFO", "propagate": False, }, @@ -76,14 +83,14 @@ def health_check_filter(record: LogRecord) -> bool: "handlers": ["django.server"], # Filter health check logs "filters": ["health_check", "request_id"], - "level": "INFO", + "level": LOG_LEVEL, "propagate": False, }, # Default handler for all other loggers "": { "handlers": ["general_console"], "filters": ["request_id"], - "level": "INFO", + "level": LOG_LEVEL, }, }, } diff --git a/api/docs/guides/index.md b/api/docs/guides/index.md index 25aa56785..6c099f4b5 100644 --- a/api/docs/guides/index.md +++ b/api/docs/guides/index.md @@ -14,4 +14,5 @@ https document publish deploy +logging ``` diff --git a/api/docs/guides/logging.md b/api/docs/guides/logging.md new file mode 100644 index 000000000..01ea4e382 --- /dev/null +++ b/api/docs/guides/logging.md @@ -0,0 +1,23 @@ +# Logging + +Openverse is in the process of establishing and implementing its logging strategy. Until the [introduction of more extensive logging in the Elasticsearch controller modules](https://github.com/WordPress/openverse-api/pull/790) we did not have an established approach to logging. That PR still does not establish a comprehensive approach, but it does introduce and follow some rules: + +1. Always use a child logger based on the module name and method name. For example, at the top level of a module, there should be a `parent_logger` variable that is the result of calling `logging.getLogger(__name__)`. Individual methods that log should [get a child logger for themselves based on this parent logger and their method name](https://docs.python.org/3/library/logging.html#logging.Logger.getChild): +```py +def apply_filters(self, ...): + logging = parent_logger.getChild("apply_filters") + ... +``` +2. When logging variable values, log them in the following format: `name=value`. Prefer that `name` equals the expression that produces the `value`. For example, if logging the length of a list named `providers`, `name` should be `len(providers)`, with the full expression looking like this: `len(providers)={len(providers)}`. If logging a property of an object, prefer `object.property_name={object.property_name}`. Exclude serialization from the name like `json.dumps` as this is assumed: `verified={json.dumps(verified)}`. +3. Avoid using the `pprint` module for serializing log data as it significantly increases the amount of space and time it takes to write a log. Instead, prefer a simpler `json.dumps`. + +These practices provide context in the logs and makes them uniformly searchable and filterable based on the values assigned to the names. Using a child logger means we can easily see all the logs for a method. Using the `name=value` format means we always know how to filter any given logged variable either by name or by name and value. + +Openverse also makes use of request IDs. Because multiple workers are writing to the same log at once, any given request's logs may be interspersed with the logs of another. Filtering on the request ID allows us to see the full logging story of a single request. This allows us to trace problematic requests through the codebase and understand what specific parts of the code are causing problems with these particular requests. + +## Future improvements + +Potential future improvements to logging in Openverse could include: + +1. Even more uniform data logging format like formatting all logs as JSON. +2. Establishing clearer practices around what log levels to use and when.