From 0c652b1815e196cb08557d83bca7835210064aba Mon Sep 17 00:00:00 2001 From: Dhruv Bhanushali Date: Sat, 16 Dec 2023 10:32:23 +0400 Subject: [PATCH] Make tallying of thumbnail errors Redis-resilient --- api/api/utils/image_proxy/__init__.py | 23 ++++++++- api/test/unit/utils/test_image_proxy.py | 63 +++++++++++++++++++------ 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/api/api/utils/image_proxy/__init__.py b/api/api/utils/image_proxy/__init__.py index b7f85dad6ea..c31c72fca77 100644 --- a/api/api/utils/image_proxy/__init__.py +++ b/api/api/utils/image_proxy/__init__.py @@ -1,3 +1,4 @@ +import itertools import logging from dataclasses import dataclass from typing import Literal @@ -12,6 +13,7 @@ import sentry_sdk from aiohttp.client_exceptions import ClientResponseError from asgiref.sync import sync_to_async +from redis.exceptions import ConnectionError from sentry_sdk import push_scope, set_context from api.utils.aiohttp import get_aiohttp_session @@ -24,6 +26,8 @@ parent_logger = logging.getLogger(__name__) +exception_iterator = itertools.count() + HEADERS = { "User-Agent": settings.OUTBOUND_USER_AGENT_TEMPLATE.format( purpose="ThumbnailGeneration" @@ -113,6 +117,15 @@ def _tally_response( ) +@sync_to_async +def _tally_client_response_errors(tallies, month: str, domain: str, status: int): + logger = parent_logger.getChild("_tally_client_response_errors") + try: + tallies.incr(f"thumbnail_http_error:{domain}:{month}:{status}") + except ConnectionError: + logger.warning("Redis connect failed, thumbnail HTTP errors not tallied.") + + _UPSTREAM_TIMEOUT = aiohttp.ClientTimeout(15) @@ -178,7 +191,13 @@ async def get( except Exception as exc: exception_name = f"{exc.__class__.__module__}.{exc.__class__.__name__}" key = f"thumbnail_error:{exception_name}:{domain}:{month}" - count = await tallies_incr(key) + try: + count = await tallies_incr(key) + except ConnectionError: + logger.warning("Redis connect failed, thumbnail errors not tallied.") + # We will use a counter to space out Sentry logs. + count = next(exception_iterator) + if count <= settings.THUMBNAIL_ERROR_INITIAL_ALERT_THRESHOLD or ( count % settings.THUMBNAIL_ERROR_REPEATED_ALERT_FREQUENCY == 0 ): @@ -198,7 +217,7 @@ async def get( if isinstance(exc, ClientResponseError): status = exc.status do_not_wait_for( - tallies_incr(f"thumbnail_http_error:{domain}:{month}:{status}") + _tally_client_response_errors(tallies, month, domain, status) ) logger.warning( f"Failed to render thumbnail " diff --git a/api/test/unit/utils/test_image_proxy.py b/api/test/unit/utils/test_image_proxy.py index cc9983736f1..2e2a0add367 100644 --- a/api/test/unit/utils/test_image_proxy.py +++ b/api/test/unit/utils/test_image_proxy.py @@ -1,6 +1,8 @@ import asyncio +import itertools from dataclasses import replace from test.factory.models.image import ImageFactory +from unittest.mock import patch from urllib.parse import urlencode from django.conf import settings @@ -374,6 +376,7 @@ def test_get_successful_records_response_code( ), ], ) +@cache_availability_params @alert_count_params def test_get_exception_handles_error( photon_get, @@ -383,14 +386,25 @@ def test_get_exception_handles_error( should_alert, sentry_capture_exception, setup_request_exception, - redis, + is_cache_reachable, + cache_name, + request, + caplog, ): + cache = request.getfixturevalue(cache_name) + setup_request_exception(exc) month = get_monthly_timestamp() key = f"thumbnail_error:{exc_name}:{TEST_IMAGE_DOMAIN}:{month}" - redis.set(key, count_start) + if is_cache_reachable: + cache.set(key, count_start) - with pytest.raises(UpstreamThumbnailException): + with ( + pytest.raises(UpstreamThumbnailException), + patch( + "api.utils.image_proxy.exception_iterator", itertools.count(count_start + 1) + ), + ): photon_get(TEST_MEDIA_INFO) assert_func = ( @@ -399,9 +413,14 @@ def test_get_exception_handles_error( else sentry_capture_exception.assert_not_called ) assert_func() - assert redis.get(key) == str(count_start + 1).encode() + + if is_cache_reachable: + assert cache.get(key) == str(count_start + 1).encode() + else: + assert "Redis connect failed, thumbnail errors not tallied." in caplog.text +@cache_availability_params @alert_count_params @pytest.mark.parametrize( "status_code, text", @@ -419,15 +438,23 @@ def test_get_http_exception_handles_error( count_start, should_alert, sentry_capture_exception, - redis, + is_cache_reachable, + cache_name, + request, + caplog, ): + cache = request.getfixturevalue(cache_name) + month = get_monthly_timestamp() key = f"thumbnail_error:aiohttp.client_exceptions.ClientResponseError:{TEST_IMAGE_DOMAIN}:{month}" - redis.set(key, count_start) + if is_cache_reachable: + cache.set(key, count_start) - with pytest.raises(UpstreamThumbnailException): + with pytest.raises(UpstreamThumbnailException), patch( + "api.utils.image_proxy.exception_iterator", itertools.count(count_start + 1) + ): with pook.use(): - pook.get(PHOTON_URL_FOR_TEST_IMAGE).reply(status_code, text).mock + pook.get(PHOTON_URL_FOR_TEST_IMAGE).reply(status_code, text) photon_get(TEST_MEDIA_INFO) assert_func = ( @@ -436,13 +463,21 @@ def test_get_http_exception_handles_error( else sentry_capture_exception.assert_not_called ) assert_func() - assert redis.get(key) == str(count_start + 1).encode() - # Assertions about the HTTP error specific message - assert ( - redis.get(f"thumbnail_http_error:{TEST_IMAGE_DOMAIN}:{month}:{status_code}") - == b"1" - ) + if is_cache_reachable: + assert cache.get(key) == str(count_start + 1).encode() + assert ( + cache.get(f"thumbnail_http_error:{TEST_IMAGE_DOMAIN}:{month}:{status_code}") + == b"1" + ) + else: + assert all( + message in caplog.text + for message in [ + "Redis connect failed, thumbnail HTTP errors not tallied.", + "Redis connect failed, thumbnail errors not tallied.", + ] + ) @pook.on