From 8615579a7c2fc93f1f6d481abbdd11819e4144a4 Mon Sep 17 00:00:00 2001
From: Staci Mullins <63313398+stacimc@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:58:02 -0800
Subject: [PATCH 1/4] Extend Europeana timeout even further (#3826)

---
 catalog/dags/providers/provider_workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py
index 441b59b57c6..08dc13bc22a 100644
--- a/catalog/dags/providers/provider_workflows.py
+++ b/catalog/dags/providers/provider_workflows.py
@@ -218,7 +218,7 @@ def __post_init__(self):
         start_date=datetime(2022, 10, 27),
         schedule_string="@daily",
         dated=True,
-        pull_timeout=timedelta(weeks=1),
+        pull_timeout=timedelta(days=12),
     ),
     ProviderWorkflow(
         ingester_class=FinnishMuseumsDataIngester,

From 157bb6c9ea31983d8b8adf69ab1b572727e419f8 Mon Sep 17 00:00:00 2001
From: "Openverse (Bot)" <101814513+openverse-bot@users.noreply.github.com>
Date: Tue, 27 Feb 2024 07:59:27 +1100
Subject: [PATCH 2/4] Publish changelog for frontend-2024.02.26.18.58.35
 (#3829)

Co-authored-by: AetherUnbound <AetherUnbound@users.noreply.github.com>
---
 documentation/changelogs/frontend/2024.02.26.18.58.35.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 documentation/changelogs/frontend/2024.02.26.18.58.35.md

diff --git a/documentation/changelogs/frontend/2024.02.26.18.58.35.md b/documentation/changelogs/frontend/2024.02.26.18.58.35.md
new file mode 100644
index 00000000000..136625f2ee5
--- /dev/null
+++ b/documentation/changelogs/frontend/2024.02.26.18.58.35.md
@@ -0,0 +1,6 @@
+# 2024.02.26.18.58.35
+
+## Bug Fixes
+
+- Fix sketchfab loading state error
+  ([#3794](https://github.com/WordPress/openverse/pull/3794)) by @zackkrida

From 2cb8b5eb11f0abb39f79697ae2b8ddc065b7e0f7 Mon Sep 17 00:00:00 2001
From: Kanishka Bansode <96020697+kb-0311@users.noreply.github.com>
Date: Tue, 27 Feb 2024 09:00:57 +0530
Subject: [PATCH 3/4] Replaced `get_token_info` calls with the
 `request.auth.application` (#3528)

* Replace get_token_info with request.auth.application

Co-authored-by: Kanishka Bansode <96020697+kb-0311@users.noreply.github.com>

* Remove usage of request.user for anon request checks

* Invert logic for checking authorization

* Update test to reflect that the request is anonymous

Previously the request in this test was considered not anonymous,
because `request` was None. With the update it is identified as
anonymous, and therefore the authority boost is not applied.

---------

Co-authored-by: sarayourfriend <git@sarayourfriend.pictures>
Co-authored-by: Staci Cooper <staci.cooper@automattic.com>
---
 .../middleware/response_headers_middleware.py | 17 +++--
 api/api/serializers/media_serializers.py      |  4 +-
 api/api/utils/oauth2_helper.py                | 65 -------------------
 api/api/utils/throttle.py                     | 20 +++---
 api/api/views/oauth2_views.py                 | 20 ++----
 .../test_search_controller_search_query.py    |  1 -
 api/test/unit/utils/test_throttle.py          |  1 +
 7 files changed, 31 insertions(+), 97 deletions(-)
 delete mode 100644 api/api/utils/oauth2_helper.py

diff --git a/api/api/middleware/response_headers_middleware.py b/api/api/middleware/response_headers_middleware.py
index 76b21e4ae6a..740c4650378 100644
--- a/api/api/middleware/response_headers_middleware.py
+++ b/api/api/middleware/response_headers_middleware.py
@@ -1,4 +1,6 @@
-from api.utils.oauth2_helper import get_token_info
+from rest_framework.request import Request
+
+from api.models.oauth import ThrottledApplication
 
 
 def response_headers_middleware(get_response):
@@ -11,14 +13,15 @@ def response_headers_middleware(get_response):
     to identify malicious requesters or request patterns.
     """
 
-    def middleware(request):
+    def middleware(request: Request):
         response = get_response(request)
 
-        if hasattr(request, "auth") and request.auth:
-            token_info = get_token_info(str(request.auth))
-            if token_info:
-                response["x-ov-client-application-name"] = token_info.application_name
-                response["x-ov-client-application-verified"] = token_info.verified
+        if not (hasattr(request, "auth") and hasattr(request.auth, "application")):
+            return response
+
+        application: ThrottledApplication = request.auth.application
+        response["x-ov-client-application-name"] = application.name
+        response["x-ov-client-application-verified"] = application.verified
 
         return response
 
diff --git a/api/api/serializers/media_serializers.py b/api/api/serializers/media_serializers.py
index 961ce5b5027..1dd62e8b054 100644
--- a/api/api/serializers/media_serializers.py
+++ b/api/api/serializers/media_serializers.py
@@ -54,7 +54,7 @@ class PaginatedRequestSerializer(serializers.Serializer):
 
     def validate_page_size(self, value):
         request = self.context.get("request")
-        is_anonymous = bool(request and request.user and request.user.is_anonymous)
+        is_anonymous = getattr(request, "auth", None) is None
         max_value = (
             settings.MAX_ANONYMOUS_PAGE_SIZE
             if is_anonymous
@@ -247,7 +247,7 @@ class MediaSearchRequestSerializer(PaginatedRequestSerializer):
 
     def is_request_anonymous(self):
         request = self.context.get("request")
-        return bool(request and request.user and request.user.is_anonymous)
+        return getattr(request, "auth", None) is None
 
     @staticmethod
     def _truncate(value):
diff --git a/api/api/utils/oauth2_helper.py b/api/api/utils/oauth2_helper.py
deleted file mode 100644
index 6ffd0c4c0c9..00000000000
--- a/api/api/utils/oauth2_helper.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import datetime as dt
-import logging
-from dataclasses import dataclass
-
-from oauth2_provider.models import AccessToken
-
-from api import models
-
-
-parent_logger = logging.getLogger(__name__)
-
-
-@dataclass
-class TokenInfo:
-    """Extracted ``models.ThrottledApplication`` metadata."""
-
-    client_id: str
-    rate_limit_model: str
-    verified: bool
-    application_name: str
-
-    @property
-    def valid(self):
-        return self.client_id and self.verified
-
-
-def get_token_info(token: str) -> None | TokenInfo:
-    """
-    Recover an OAuth2 application client ID and rate limit model from an access token.
-
-    :param token: An OAuth2 access token.
-    :return: If the token is valid, return the client ID associated with the
-    token, rate limit model, and email verification status as a tuple; else
-    return ``(None, None, None)``.
-    """
-    logger = parent_logger.getChild("get_token_info")
-    try:
-        token = AccessToken.objects.get(token=token)
-    except AccessToken.DoesNotExist:
-        return None
-
-    try:
-        application = models.ThrottledApplication.objects.get(accesstoken=token)
-    except models.ThrottledApplication.DoesNotExist:
-        # Critical because it indicates a data integrity problem.
-        # In practice should never occur so long as the preceding
-        # operation to retrieve the access token was successful.
-        logger.critical("Failed to find application associated with access token.")
-        return None
-
-    expired = token.expires < dt.datetime.now(token.expires.tzinfo)
-    if expired:
-        logger.info(
-            "rejected expired access token "
-            f"application.name={application.name} "
-            f"application.client_id={application.client_id} "
-        )
-        return None
-
-    return TokenInfo(
-        client_id=str(application.client_id),
-        rate_limit_model=application.rate_limit_model,
-        verified=application.verified,
-        application_name=application.name,
-    )
diff --git a/api/api/utils/throttle.py b/api/api/utils/throttle.py
index 4e3effaab34..8d4f4b1a859 100644
--- a/api/api/utils/throttle.py
+++ b/api/api/utils/throttle.py
@@ -5,8 +5,6 @@
 
 from redis.exceptions import ConnectionError
 
-from api.utils.oauth2_helper import get_token_info
-
 
 parent_logger = logging.getLogger(__name__)
 
@@ -47,8 +45,11 @@ def has_valid_token(self, request):
         if not request.auth:
             return False
 
-        token_info = get_token_info(str(request.auth))
-        return token_info and token_info.valid
+        application = getattr(request.auth, "application", None)
+        if application is None:
+            return False
+
+        return application.client_id and application.verified
 
     def get_cache_key(self, request, view):
         return self.cache_format % {
@@ -146,15 +147,16 @@ class AbstractOAuth2IdRateThrottle(SimpleRateThrottle, metaclass=abc.ABCMeta):
 
     def get_cache_key(self, request, view):
         # Find the client ID associated with the access token.
-        auth = str(request.auth)
-        token_info = get_token_info(auth)
-        if not (token_info and token_info.valid):
+        if not self.has_valid_token(request):
             return None
 
-        if token_info.rate_limit_model not in self.applies_to_rate_limit_model:
+        # `self.has_valid_token` call earlier ensures accessing `application` will not fail
+        application = request.auth.application
+
+        if application.rate_limit_model not in self.applies_to_rate_limit_model:
             return None
 
-        return self.cache_format % {"scope": self.scope, "ident": token_info.client_id}
+        return self.cache_format % {"scope": self.scope, "ident": application.client_id}
 
 
 class OAuth2IdThumbnailRateThrottle(AbstractOAuth2IdRateThrottle):
diff --git a/api/api/views/oauth2_views.py b/api/api/views/oauth2_views.py
index db4d780af72..b8511ae03ad 100644
--- a/api/api/views/oauth2_views.py
+++ b/api/api/views/oauth2_views.py
@@ -7,6 +7,7 @@
 from django.conf import settings
 from django.core.cache import cache
 from django.core.mail import send_mail
+from rest_framework.request import Request
 from rest_framework.response import Response
 from rest_framework.reverse import reverse
 from rest_framework.views import APIView
@@ -22,7 +23,6 @@
     OAuth2KeyInfoSerializer,
     OAuth2RegistrationSerializer,
 )
-from api.utils.oauth2_helper import get_token_info
 from api.utils.throttle import OnePerSecond, TenPerDay
 
 
@@ -169,7 +169,7 @@ class CheckRates(APIView):
     throttle_classes = (OnePerSecond,)
 
     @key_info
-    def get(self, request, format=None):
+    def get(self, request: Request, format=None):
         """
         Get information about your API key.
 
@@ -181,23 +181,17 @@ def get(self, request, format=None):
         """
 
         # TODO: Replace 403 responses with DRF `authentication_classes`.
-        if not request.auth:
+        if not request.auth or not hasattr(request.auth, "application"):
             return Response(status=403, data="Forbidden")
 
-        access_token = str(request.auth)
-        token_info = get_token_info(access_token)
+        application: ThrottledApplication = request.auth.application
 
-        if not token_info:
-            # This shouldn't happen if `request.auth` was true above,
-            # but better safe than sorry
-            return Response(status=403, data="Forbidden")
-
-        client_id = token_info.client_id
+        client_id = application.client_id
 
         if not client_id:
             return Response(status=403, data="Forbidden")
 
-        throttle_type = token_info.rate_limit_model
+        throttle_type = application.rate_limit_model
         throttle_key = "throttle_{scope}_{client_id}"
         if throttle_type == "standard":
             sustained_throttle_key = throttle_key.format(
@@ -242,7 +236,7 @@ def get(self, request, format=None):
                 "requests_this_minute": burst_requests,
                 "requests_today": sustained_requests,
                 "rate_limit_model": throttle_type,
-                "verified": token_info.verified,
+                "verified": application.verified,
             }
         )
         return Response(status=status, data=response_data.data)
diff --git a/api/test/unit/controllers/test_search_controller_search_query.py b/api/test/unit/controllers/test_search_controller_search_query.py
index 3ccd56daded..081bd356c27 100644
--- a/api/test/unit/controllers/test_search_controller_search_query.py
+++ b/api/test/unit/controllers/test_search_controller_search_query.py
@@ -172,7 +172,6 @@ def test_create_search_query_q_search_with_filters(image_media_type_config):
                 }
             },
             {"rank_feature": {"boost": 10000, "field": "standardized_popularity"}},
-            {"rank_feature": {"boost": 25000, "field": "authority_boost"}},
         ],
     }
 
diff --git a/api/test/unit/utils/test_throttle.py b/api/test/unit/utils/test_throttle.py
index 1afdaff5325..a727ea2b836 100644
--- a/api/test/unit/utils/test_throttle.py
+++ b/api/test/unit/utils/test_throttle.py
@@ -71,6 +71,7 @@ def enable_throttles(settings):
 def access_token():
     token = AccessTokenFactory.create()
     token.application.verified = True
+    token.application.client_id = 123
     token.application.save()
     return token
 

From 2cffcb9f8da6961e84a00854a3cd472fd0f9dad8 Mon Sep 17 00:00:00 2001
From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com>
Date: Tue, 27 Feb 2024 14:31:55 +1100
Subject: [PATCH 4/4] Use infrequent access when uploading provider TSVs
 (#3810)

* Add recipe useful for testing env var changes

* Use infrequent access storage class for provider TSVs

* Fix mock failing on new call

* Update catalog/env.template

Co-authored-by: Madison Swain-Bowden <bowdenm@spu.edu>

---------

Co-authored-by: Madison Swain-Bowden <bowdenm@spu.edu>
---
 catalog/dags/common/loader/s3.py              |  8 +++-
 .../dags/providers/provider_dag_factory.py    |  3 ++
 catalog/dags/providers/provider_workflows.py  | 41 ++++++++++++++++++-
 catalog/env.template                          |  6 +++
 .../dags/providers/test_provider_workflows.py |  1 +
 justfile                                      |  4 ++
 6 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/catalog/dags/common/loader/s3.py b/catalog/dags/common/loader/s3.py
index 7410bac6780..99f267f86e4 100644
--- a/catalog/dags/common/loader/s3.py
+++ b/catalog/dags/common/loader/s3.py
@@ -42,12 +42,15 @@ def copy_file_to_s3(
     s3_prefix,
     aws_conn_id,
     ti,
+    extra_args=None,
 ):
     """
     Copy a TSV file to S3 with the given prefix.
     The TSV's version is pushed to the `tsv_version` XCom, and the constructed
     S3 key is pushed to the `s3_key` XCom.
     The TSV is removed after the upload is complete.
+
+    ``extra_args`` refers to the S3Hook argument.
     """
     if tsv_file_path is None:
         raise FileNotFoundError("No TSV file path was provided")
@@ -57,7 +60,10 @@ def copy_file_to_s3(
     tsv_version = paths.get_tsv_version(tsv_file_path)
     s3_key = f"{s3_prefix}/{tsv_file.name}"
     logger.info(f"Uploading {tsv_file_path} to {s3_bucket}:{s3_key}")
-    s3 = S3Hook(aws_conn_id=aws_conn_id)
+    s3 = S3Hook(
+        aws_conn_id=aws_conn_id,
+        extra_args=extra_args or {},
+    )
     s3.load_file(tsv_file_path, s3_key, bucket_name=s3_bucket)
     ti.xcom_push(key="tsv_version", value=tsv_version)
     ti.xcom_push(key="s3_key", value=s3_key)
diff --git a/catalog/dags/providers/provider_dag_factory.py b/catalog/dags/providers/provider_dag_factory.py
index 1e5a5d04068..dc31be4892c 100644
--- a/catalog/dags/providers/provider_dag_factory.py
+++ b/catalog/dags/providers/provider_dag_factory.py
@@ -230,6 +230,9 @@ def append_day_shift(id_str):
                             else None,
                         ),
                         "aws_conn_id": AWS_CONN_ID,
+                        "extra_args": {
+                            "StorageClass": conf.s3_tsv_storage_class,
+                        },
                     },
                     trigger_rule=TriggerRule.NONE_SKIPPED,
                 )
diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py
index 08dc13bc22a..0c1f54858eb 100644
--- a/catalog/dags/providers/provider_workflows.py
+++ b/catalog/dags/providers/provider_workflows.py
@@ -166,6 +166,25 @@ class ProviderWorkflow:
     tags: list[str] = field(default_factory=list)
     overrides: list[TaskOverride] = field(default_factory=list)
 
+    # Set when the object is uploaded, even though we access the object later in
+    # the DAG. IA incurs additional retrieval fees per request, unlike plain
+    # standard storage. However, as of writing, that costs 0.001 USD (1/10th of
+    # a US cent) per 1k requests. In other words, a minuscule amount, considering
+    # we will access the object once later in the DAG, to upsert it to the DB,
+    # and then in all likelihood never access it again.
+    # Even if we did, and had to pay the retrieval fee, we would still come out
+    # ahead on storage costs, because IA is so much less expensive than regular
+    # storage. We could set the storage class in a later task in the DAG, to
+    # avoid the one time retrieval fee. However, that adds complexity to the DAG
+    # that we can avoid by eagerly setting the storage class early, and the actual
+    # savings would probably be nil, factoring in the time spent in standard storage
+    # incurring standard storage costs. If it absolutely needs to be rationalised,
+    # consider the amount of energy spent on the extra request to S3 to update the
+    # storage cost to try to get around a retrieval fee (which, again, will not
+    # actually cost more, all things considered). Saving that energy could melt
+    # the glaciers all that much more slowly.
+    s3_tsv_storage_class: str = "STANDARD_IA"
+
     def _get_module_info(self):
         # Get the module the ProviderDataIngester was defined in
         provider_script = inspect.getmodule(self.ingester_class)
@@ -186,12 +205,30 @@ def __post_init__(self):
         if not self.doc_md:
             self.doc_md = provider_script.__doc__
 
-        # Check for custom configuration overrides, which will be applied when
-        # the DAG is generated.
+        self._process_configuration_overrides()
+
+    def _process_configuration_overrides(self):
+        """
+        Check for and apply custom configuration overrides.
+
+        These are only applied when the DAG is generated.
+        """
+
+        # Provider-specific configuration overrides
         self.overrides = Variable.get(
             "CONFIGURATION_OVERRIDES", default_var={}, deserialize_json=True
         ).get(self.dag_id, [])
 
+        # Allow forcing the default to something other than `STANDARD_IA`
+        # Primarily meant for use in local development where minio is used
+        # which does not support all AWS storage classes
+        # https://github.com/minio/minio/issues/5469
+        # This intentionally applies to all providers, rather than the provider-specific
+        # overrides above
+        self.s3_tsv_storage_class = Variable.get(
+            "DEFAULT_S3_TSV_STORAGE_CLASS", default_var=self.s3_tsv_storage_class
+        )
+
 
 PROVIDER_WORKFLOWS = [
     ProviderWorkflow(
diff --git a/catalog/env.template b/catalog/env.template
index 7125009f6ec..a86dd4c6af4 100644
--- a/catalog/env.template
+++ b/catalog/env.template
@@ -63,6 +63,12 @@ AIRFLOW_CONN_SLACK_NOTIFICATIONS=https://slack
 AIRFLOW_CONN_SLACK_ALERTS=https://slack
 
 S3_LOCAL_ENDPOINT=http://s3:5000
+# Set to a non-default value supported by minio in local development to workaround
+# Minio's lack of support for all AWS storage classes, while still using a non-default
+# value so that the expected behaviour can be verified (specifically, that the storage
+# class is not the default "STANDARD")
+# https://github.com/minio/minio/issues/5469
+AIRFLOW_VAR_DEFAULT_S3_TSV_STORAGE_CLASS=REDUCED_REDUNDANCY
 
 # Connection to the Ingestion Server, used for managing data refreshes. Default is used to
 # connect to your locally running ingestion server.
diff --git a/catalog/tests/dags/providers/test_provider_workflows.py b/catalog/tests/dags/providers/test_provider_workflows.py
index c6f0a6ee17a..09cd3e0b643 100644
--- a/catalog/tests/dags/providers/test_provider_workflows.py
+++ b/catalog/tests/dags/providers/test_provider_workflows.py
@@ -91,6 +91,7 @@ def test_overrides(configuration_overrides, expected_overrides):
     with mock.patch("providers.provider_workflows.Variable") as MockVariable:
         MockVariable.get.side_effect = [
             configuration_overrides,
+            MockVariable.get_original()[0],
         ]
         test_workflow = ProviderWorkflow(
             dag_id="my_dag_id",
diff --git a/justfile b/justfile
index d5f584b7940..db091073b7d 100644
--- a/justfile
+++ b/justfile
@@ -200,6 +200,10 @@ init:
 down *flags:
     just dc down {{ flags }}
 
+# Take all services down then call the specified app's up recipe. ex.: `just dup catalog` is useful for restarting the catalog with new environment variables
+dup app:
+    just down && just {{ app }}/up
+
 # Recreate all volumes and containers from scratch
 recreate:
     just down -v