From 8615579a7c2fc93f1f6d481abbdd11819e4144a4 Mon Sep 17 00:00:00 2001 From: Staci Mullins <63313398+stacimc@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:58:02 -0800 Subject: [PATCH 1/4] Extend Europeana timeout even further (#3826) --- catalog/dags/providers/provider_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 441b59b57c6..08dc13bc22a 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -218,7 +218,7 @@ def __post_init__(self): start_date=datetime(2022, 10, 27), schedule_string="@daily", dated=True, - pull_timeout=timedelta(weeks=1), + pull_timeout=timedelta(days=12), ), ProviderWorkflow( ingester_class=FinnishMuseumsDataIngester, From 157bb6c9ea31983d8b8adf69ab1b572727e419f8 Mon Sep 17 00:00:00 2001 From: "Openverse (Bot)" <101814513+openverse-bot@users.noreply.github.com> Date: Tue, 27 Feb 2024 07:59:27 +1100 Subject: [PATCH 2/4] Publish changelog for frontend-2024.02.26.18.58.35 (#3829) Co-authored-by: AetherUnbound --- documentation/changelogs/frontend/2024.02.26.18.58.35.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 documentation/changelogs/frontend/2024.02.26.18.58.35.md diff --git a/documentation/changelogs/frontend/2024.02.26.18.58.35.md b/documentation/changelogs/frontend/2024.02.26.18.58.35.md new file mode 100644 index 00000000000..136625f2ee5 --- /dev/null +++ b/documentation/changelogs/frontend/2024.02.26.18.58.35.md @@ -0,0 +1,6 @@ +# 2024.02.26.18.58.35 + +## Bug Fixes + +- Fix sketchfab loading state error + ([#3794](https://github.com/WordPress/openverse/pull/3794)) by @zackkrida From 2cb8b5eb11f0abb39f79697ae2b8ddc065b7e0f7 Mon Sep 17 00:00:00 2001 From: Kanishka Bansode <96020697+kb-0311@users.noreply.github.com> Date: Tue, 27 Feb 2024 09:00:57 +0530 Subject: [PATCH 3/4] Replaced `get_token_info` calls with the `request.auth.application` (#3528) * Replace get_token_info with request.auth.application Co-authored-by: Kanishka Bansode <96020697+kb-0311@users.noreply.github.com> * Remove usage of request.user for anon request checks * Invert logic for checking authorization * Update test to reflect that the request is anonymous Previously the request in this test was considered not anonymous, because `request` was None. With the update it is identified as anonymous, and therefore the authority boost is not applied. --------- Co-authored-by: sarayourfriend Co-authored-by: Staci Cooper --- .../middleware/response_headers_middleware.py | 17 +++-- api/api/serializers/media_serializers.py | 4 +- api/api/utils/oauth2_helper.py | 65 ------------------- api/api/utils/throttle.py | 20 +++--- api/api/views/oauth2_views.py | 20 ++---- .../test_search_controller_search_query.py | 1 - api/test/unit/utils/test_throttle.py | 1 + 7 files changed, 31 insertions(+), 97 deletions(-) delete mode 100644 api/api/utils/oauth2_helper.py diff --git a/api/api/middleware/response_headers_middleware.py b/api/api/middleware/response_headers_middleware.py index 76b21e4ae6a..740c4650378 100644 --- a/api/api/middleware/response_headers_middleware.py +++ b/api/api/middleware/response_headers_middleware.py @@ -1,4 +1,6 @@ -from api.utils.oauth2_helper import get_token_info +from rest_framework.request import Request + +from api.models.oauth import ThrottledApplication def response_headers_middleware(get_response): @@ -11,14 +13,15 @@ def response_headers_middleware(get_response): to identify malicious requesters or request patterns. """ - def middleware(request): + def middleware(request: Request): response = get_response(request) - if hasattr(request, "auth") and request.auth: - token_info = get_token_info(str(request.auth)) - if token_info: - response["x-ov-client-application-name"] = token_info.application_name - response["x-ov-client-application-verified"] = token_info.verified + if not (hasattr(request, "auth") and hasattr(request.auth, "application")): + return response + + application: ThrottledApplication = request.auth.application + response["x-ov-client-application-name"] = application.name + response["x-ov-client-application-verified"] = application.verified return response diff --git a/api/api/serializers/media_serializers.py b/api/api/serializers/media_serializers.py index 961ce5b5027..1dd62e8b054 100644 --- a/api/api/serializers/media_serializers.py +++ b/api/api/serializers/media_serializers.py @@ -54,7 +54,7 @@ class PaginatedRequestSerializer(serializers.Serializer): def validate_page_size(self, value): request = self.context.get("request") - is_anonymous = bool(request and request.user and request.user.is_anonymous) + is_anonymous = getattr(request, "auth", None) is None max_value = ( settings.MAX_ANONYMOUS_PAGE_SIZE if is_anonymous @@ -247,7 +247,7 @@ class MediaSearchRequestSerializer(PaginatedRequestSerializer): def is_request_anonymous(self): request = self.context.get("request") - return bool(request and request.user and request.user.is_anonymous) + return getattr(request, "auth", None) is None @staticmethod def _truncate(value): diff --git a/api/api/utils/oauth2_helper.py b/api/api/utils/oauth2_helper.py deleted file mode 100644 index 6ffd0c4c0c9..00000000000 --- a/api/api/utils/oauth2_helper.py +++ /dev/null @@ -1,65 +0,0 @@ -import datetime as dt -import logging -from dataclasses import dataclass - -from oauth2_provider.models import AccessToken - -from api import models - - -parent_logger = logging.getLogger(__name__) - - -@dataclass -class TokenInfo: - """Extracted ``models.ThrottledApplication`` metadata.""" - - client_id: str - rate_limit_model: str - verified: bool - application_name: str - - @property - def valid(self): - return self.client_id and self.verified - - -def get_token_info(token: str) -> None | TokenInfo: - """ - Recover an OAuth2 application client ID and rate limit model from an access token. - - :param token: An OAuth2 access token. - :return: If the token is valid, return the client ID associated with the - token, rate limit model, and email verification status as a tuple; else - return ``(None, None, None)``. - """ - logger = parent_logger.getChild("get_token_info") - try: - token = AccessToken.objects.get(token=token) - except AccessToken.DoesNotExist: - return None - - try: - application = models.ThrottledApplication.objects.get(accesstoken=token) - except models.ThrottledApplication.DoesNotExist: - # Critical because it indicates a data integrity problem. - # In practice should never occur so long as the preceding - # operation to retrieve the access token was successful. - logger.critical("Failed to find application associated with access token.") - return None - - expired = token.expires < dt.datetime.now(token.expires.tzinfo) - if expired: - logger.info( - "rejected expired access token " - f"application.name={application.name} " - f"application.client_id={application.client_id} " - ) - return None - - return TokenInfo( - client_id=str(application.client_id), - rate_limit_model=application.rate_limit_model, - verified=application.verified, - application_name=application.name, - ) diff --git a/api/api/utils/throttle.py b/api/api/utils/throttle.py index 4e3effaab34..8d4f4b1a859 100644 --- a/api/api/utils/throttle.py +++ b/api/api/utils/throttle.py @@ -5,8 +5,6 @@ from redis.exceptions import ConnectionError -from api.utils.oauth2_helper import get_token_info - parent_logger = logging.getLogger(__name__) @@ -47,8 +45,11 @@ def has_valid_token(self, request): if not request.auth: return False - token_info = get_token_info(str(request.auth)) - return token_info and token_info.valid + application = getattr(request.auth, "application", None) + if application is None: + return False + + return application.client_id and application.verified def get_cache_key(self, request, view): return self.cache_format % { @@ -146,15 +147,16 @@ class AbstractOAuth2IdRateThrottle(SimpleRateThrottle, metaclass=abc.ABCMeta): def get_cache_key(self, request, view): # Find the client ID associated with the access token. - auth = str(request.auth) - token_info = get_token_info(auth) - if not (token_info and token_info.valid): + if not self.has_valid_token(request): return None - if token_info.rate_limit_model not in self.applies_to_rate_limit_model: + # `self.has_valid_token` call earlier ensures accessing `application` will not fail + application = request.auth.application + + if application.rate_limit_model not in self.applies_to_rate_limit_model: return None - return self.cache_format % {"scope": self.scope, "ident": token_info.client_id} + return self.cache_format % {"scope": self.scope, "ident": application.client_id} class OAuth2IdThumbnailRateThrottle(AbstractOAuth2IdRateThrottle): diff --git a/api/api/views/oauth2_views.py b/api/api/views/oauth2_views.py index db4d780af72..b8511ae03ad 100644 --- a/api/api/views/oauth2_views.py +++ b/api/api/views/oauth2_views.py @@ -7,6 +7,7 @@ from django.conf import settings from django.core.cache import cache from django.core.mail import send_mail +from rest_framework.request import Request from rest_framework.response import Response from rest_framework.reverse import reverse from rest_framework.views import APIView @@ -22,7 +23,6 @@ OAuth2KeyInfoSerializer, OAuth2RegistrationSerializer, ) -from api.utils.oauth2_helper import get_token_info from api.utils.throttle import OnePerSecond, TenPerDay @@ -169,7 +169,7 @@ class CheckRates(APIView): throttle_classes = (OnePerSecond,) @key_info - def get(self, request, format=None): + def get(self, request: Request, format=None): """ Get information about your API key. @@ -181,23 +181,17 @@ def get(self, request, format=None): """ # TODO: Replace 403 responses with DRF `authentication_classes`. - if not request.auth: + if not request.auth or not hasattr(request.auth, "application"): return Response(status=403, data="Forbidden") - access_token = str(request.auth) - token_info = get_token_info(access_token) + application: ThrottledApplication = request.auth.application - if not token_info: - # This shouldn't happen if `request.auth` was true above, - # but better safe than sorry - return Response(status=403, data="Forbidden") - - client_id = token_info.client_id + client_id = application.client_id if not client_id: return Response(status=403, data="Forbidden") - throttle_type = token_info.rate_limit_model + throttle_type = application.rate_limit_model throttle_key = "throttle_{scope}_{client_id}" if throttle_type == "standard": sustained_throttle_key = throttle_key.format( @@ -242,7 +236,7 @@ def get(self, request, format=None): "requests_this_minute": burst_requests, "requests_today": sustained_requests, "rate_limit_model": throttle_type, - "verified": token_info.verified, + "verified": application.verified, } ) return Response(status=status, data=response_data.data) diff --git a/api/test/unit/controllers/test_search_controller_search_query.py b/api/test/unit/controllers/test_search_controller_search_query.py index 3ccd56daded..081bd356c27 100644 --- a/api/test/unit/controllers/test_search_controller_search_query.py +++ b/api/test/unit/controllers/test_search_controller_search_query.py @@ -172,7 +172,6 @@ def test_create_search_query_q_search_with_filters(image_media_type_config): } }, {"rank_feature": {"boost": 10000, "field": "standardized_popularity"}}, - {"rank_feature": {"boost": 25000, "field": "authority_boost"}}, ], } diff --git a/api/test/unit/utils/test_throttle.py b/api/test/unit/utils/test_throttle.py index 1afdaff5325..a727ea2b836 100644 --- a/api/test/unit/utils/test_throttle.py +++ b/api/test/unit/utils/test_throttle.py @@ -71,6 +71,7 @@ def enable_throttles(settings): def access_token(): token = AccessTokenFactory.create() token.application.verified = True + token.application.client_id = 123 token.application.save() return token From 2cffcb9f8da6961e84a00854a3cd472fd0f9dad8 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:31:55 +1100 Subject: [PATCH 4/4] Use infrequent access when uploading provider TSVs (#3810) * Add recipe useful for testing env var changes * Use infrequent access storage class for provider TSVs * Fix mock failing on new call * Update catalog/env.template Co-authored-by: Madison Swain-Bowden --------- Co-authored-by: Madison Swain-Bowden --- catalog/dags/common/loader/s3.py | 8 +++- .../dags/providers/provider_dag_factory.py | 3 ++ catalog/dags/providers/provider_workflows.py | 41 ++++++++++++++++++- catalog/env.template | 6 +++ .../dags/providers/test_provider_workflows.py | 1 + justfile | 4 ++ 6 files changed, 60 insertions(+), 3 deletions(-) diff --git a/catalog/dags/common/loader/s3.py b/catalog/dags/common/loader/s3.py index 7410bac6780..99f267f86e4 100644 --- a/catalog/dags/common/loader/s3.py +++ b/catalog/dags/common/loader/s3.py @@ -42,12 +42,15 @@ def copy_file_to_s3( s3_prefix, aws_conn_id, ti, + extra_args=None, ): """ Copy a TSV file to S3 with the given prefix. The TSV's version is pushed to the `tsv_version` XCom, and the constructed S3 key is pushed to the `s3_key` XCom. The TSV is removed after the upload is complete. + + ``extra_args`` refers to the S3Hook argument. """ if tsv_file_path is None: raise FileNotFoundError("No TSV file path was provided") @@ -57,7 +60,10 @@ def copy_file_to_s3( tsv_version = paths.get_tsv_version(tsv_file_path) s3_key = f"{s3_prefix}/{tsv_file.name}" logger.info(f"Uploading {tsv_file_path} to {s3_bucket}:{s3_key}") - s3 = S3Hook(aws_conn_id=aws_conn_id) + s3 = S3Hook( + aws_conn_id=aws_conn_id, + extra_args=extra_args or {}, + ) s3.load_file(tsv_file_path, s3_key, bucket_name=s3_bucket) ti.xcom_push(key="tsv_version", value=tsv_version) ti.xcom_push(key="s3_key", value=s3_key) diff --git a/catalog/dags/providers/provider_dag_factory.py b/catalog/dags/providers/provider_dag_factory.py index 1e5a5d04068..dc31be4892c 100644 --- a/catalog/dags/providers/provider_dag_factory.py +++ b/catalog/dags/providers/provider_dag_factory.py @@ -230,6 +230,9 @@ def append_day_shift(id_str): else None, ), "aws_conn_id": AWS_CONN_ID, + "extra_args": { + "StorageClass": conf.s3_tsv_storage_class, + }, }, trigger_rule=TriggerRule.NONE_SKIPPED, ) diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 08dc13bc22a..0c1f54858eb 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -166,6 +166,25 @@ class ProviderWorkflow: tags: list[str] = field(default_factory=list) overrides: list[TaskOverride] = field(default_factory=list) + # Set when the object is uploaded, even though we access the object later in + # the DAG. IA incurs additional retrieval fees per request, unlike plain + # standard storage. However, as of writing, that costs 0.001 USD (1/10th of + # a US cent) per 1k requests. In other words, a minuscule amount, considering + # we will access the object once later in the DAG, to upsert it to the DB, + # and then in all likelihood never access it again. + # Even if we did, and had to pay the retrieval fee, we would still come out + # ahead on storage costs, because IA is so much less expensive than regular + # storage. We could set the storage class in a later task in the DAG, to + # avoid the one time retrieval fee. However, that adds complexity to the DAG + # that we can avoid by eagerly setting the storage class early, and the actual + # savings would probably be nil, factoring in the time spent in standard storage + # incurring standard storage costs. If it absolutely needs to be rationalised, + # consider the amount of energy spent on the extra request to S3 to update the + # storage cost to try to get around a retrieval fee (which, again, will not + # actually cost more, all things considered). Saving that energy could melt + # the glaciers all that much more slowly. + s3_tsv_storage_class: str = "STANDARD_IA" + def _get_module_info(self): # Get the module the ProviderDataIngester was defined in provider_script = inspect.getmodule(self.ingester_class) @@ -186,12 +205,30 @@ def __post_init__(self): if not self.doc_md: self.doc_md = provider_script.__doc__ - # Check for custom configuration overrides, which will be applied when - # the DAG is generated. + self._process_configuration_overrides() + + def _process_configuration_overrides(self): + """ + Check for and apply custom configuration overrides. + + These are only applied when the DAG is generated. + """ + + # Provider-specific configuration overrides self.overrides = Variable.get( "CONFIGURATION_OVERRIDES", default_var={}, deserialize_json=True ).get(self.dag_id, []) + # Allow forcing the default to something other than `STANDARD_IA` + # Primarily meant for use in local development where minio is used + # which does not support all AWS storage classes + # https://github.com/minio/minio/issues/5469 + # This intentionally applies to all providers, rather than the provider-specific + # overrides above + self.s3_tsv_storage_class = Variable.get( + "DEFAULT_S3_TSV_STORAGE_CLASS", default_var=self.s3_tsv_storage_class + ) + PROVIDER_WORKFLOWS = [ ProviderWorkflow( diff --git a/catalog/env.template b/catalog/env.template index 7125009f6ec..a86dd4c6af4 100644 --- a/catalog/env.template +++ b/catalog/env.template @@ -63,6 +63,12 @@ AIRFLOW_CONN_SLACK_NOTIFICATIONS=https://slack AIRFLOW_CONN_SLACK_ALERTS=https://slack S3_LOCAL_ENDPOINT=http://s3:5000 +# Set to a non-default value supported by minio in local development to workaround +# Minio's lack of support for all AWS storage classes, while still using a non-default +# value so that the expected behaviour can be verified (specifically, that the storage +# class is not the default "STANDARD") +# https://github.com/minio/minio/issues/5469 +AIRFLOW_VAR_DEFAULT_S3_TSV_STORAGE_CLASS=REDUCED_REDUNDANCY # Connection to the Ingestion Server, used for managing data refreshes. Default is used to # connect to your locally running ingestion server. diff --git a/catalog/tests/dags/providers/test_provider_workflows.py b/catalog/tests/dags/providers/test_provider_workflows.py index c6f0a6ee17a..09cd3e0b643 100644 --- a/catalog/tests/dags/providers/test_provider_workflows.py +++ b/catalog/tests/dags/providers/test_provider_workflows.py @@ -91,6 +91,7 @@ def test_overrides(configuration_overrides, expected_overrides): with mock.patch("providers.provider_workflows.Variable") as MockVariable: MockVariable.get.side_effect = [ configuration_overrides, + MockVariable.get_original()[0], ] test_workflow = ProviderWorkflow( dag_id="my_dag_id", diff --git a/justfile b/justfile index d5f584b7940..db091073b7d 100644 --- a/justfile +++ b/justfile @@ -200,6 +200,10 @@ init: down *flags: just dc down {{ flags }} +# Take all services down then call the specified app's up recipe. ex.: `just dup catalog` is useful for restarting the catalog with new environment variables +dup app: + just down && just {{ app }}/up + # Recreate all volumes and containers from scratch recreate: just down -v