Make links in docstring display as hyperlinks in docs site (#4195)
madewithkode authored Apr 24, 2024
1 parent 7c6298a commit d500b77
Showing 30 changed files with 103 additions and 97 deletions.
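The pattern across all 30 files is the same: bare URLs in module and function docstrings are wrapped in angle brackets (`<...>`), which the docs site's Markdown renderer turns into clickable hyperlinks. A minimal sketch of that rewrite as a Python helper (hypothetical; the commit's edits were made by hand, not by this script):

```python
import re

# Match a bare http(s) URL that is not already wrapped in <...>.
# The lookahead leaves trailing sentence punctuation (".", ",", ")")
# outside the link, matching the style used in this commit.
URL_RE = re.compile(r"(?<!<)(https?://[^\s<>]+?)(?=[.,)]?(?:\s|$))")


def angle_bracket_urls(docstring: str) -> str:
    """Wrap bare URLs in <...> so docs renderers display them as hyperlinks."""
    # Hypothetical helper, not part of this commit.
    return URL_RE.sub(r"<\1>", docstring)
```

Note that already-wrapped URLs are left alone, and a sentence-ending period stays outside the brackets (as in the `slack.py` hunk below, which produces `<https://app.slack.com/block-kit-builder>.`).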
2 changes: 1 addition & 1 deletion catalog/dags/common/cloudwatch.py
@@ -1,6 +1,6 @@
"""
CloudwatchWrapper extracted partially from
-https://github.com/awsdocs/aws-doc-sdk-examples/blob/54c3b82d8f9a12a862f9fcec44909829bda849af/python/example_code/cloudwatch/cloudwatch_basics.py
+<https://github.com/awsdocs/aws-doc-sdk-examples/blob/54c3b82d8f9a12a862f9fcec44909829bda849af/python/example_code/cloudwatch/cloudwatch_basics.py>
The CloudwatchWrapper requires the AWS_CLOUDWATCH_CONN_ID, or the `aws_default`
connection, to be set in the Airflow Connections.
2 changes: 1 addition & 1 deletion catalog/dags/common/sensors/utils.py
@@ -19,7 +19,7 @@ def _get_most_recent_dag_run(dag_id) -> list[datetime] | datetime:
For use as ``execution_date_fn`` argument to ``ExternalTaskSensor``.
-Adapted from https://stackoverflow.com/a/74017474
+Adapted from <https://stackoverflow.com/a/74017474>
CC BY-SA 4.0 by Stack Overflow user Nahid O.
"""
dag_runs = DagRun.find(dag_id=dag_id)
2 changes: 1 addition & 1 deletion catalog/dags/common/slack.py
@@ -6,7 +6,7 @@
- attach text, fields
This class is intended to be used with a channel-specific slack webhook.
-More information can be found here: https://app.slack.com/block-kit-builder.
+More information can be found here: <https://app.slack.com/block-kit-builder>.
## Messages are not configured to send in development
4 changes: 2 additions & 2 deletions catalog/dags/common/storage/media.py
@@ -323,8 +323,8 @@ def _validate_integer(value: int | None) -> int | None:
If the value exceeds this maximum, None is returned.
TODO: Remove this logic once the column has been embiggened
-https://github.com/WordPress/openverse-catalog/issues/730
-https://github.com/WordPress/openverse-catalog/issues/873
+<https://github.com/WordPress/openverse-catalog/issues/730>
+<https://github.com/WordPress/openverse-catalog/issues/873>
"""
if value and value >= PG_INTEGER_MAXIMUM:
logger.warning(f"Value exceeds Postgres maximum integer value: {value}")
2 changes: 1 addition & 1 deletion catalog/dags/database/batched_update/batched_update_dag.py
@@ -11,7 +11,7 @@
when all batches have been updated. It handles all deadlocking and timeout concerns,
ensuring that the provided SQL is run without interfering with ingestion. For more
information, see the implementation plan:
-https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html#special-considerations-avoiding-deadlocks-and-timeouts
+<https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html#special-considerations-avoiding-deadlocks-and-timeouts>
By default the DAG will run as a dry_run, logging the generated SQL but not actually
running it. To actually perform the update, the `dry_run` parameter must be
@@ -41,7 +41,7 @@ def skip_restore(should_skip: bool = False) -> bool:
Can be overridden by setting the `SKIP_STAGING_DATABASE_RESTORE` Airflow Variable
to `true`.
Should return `True` to have the DAG continue, and `False` to have it skipped.
-https://docs.astronomer.io/learn/airflow-branch-operator#taskshort_circuit-shortcircuitoperator
+<https://docs.astronomer.io/learn/airflow-branch-operator#taskshort_circuit-shortcircuitoperator>
"""
should_continue = not (
should_skip
@@ -63,7 +63,7 @@ def skip_restore(should_skip: bool = False) -> bool:
def get_latest_prod_snapshot(rds_hook: RdsHook = None) -> str:
"""
Get the latest automated snapshot for the production database.
-https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_snapshots.html
+<https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_snapshots.html>
Status is checked using a sensor in a later step, in case a snapshot creation is
currently in progress.
"""
@@ -91,7 +91,7 @@ def get_staging_db_details(rds_hook: RdsHook = None) -> dict:
"""
Retrieve the details of the staging database. Only some details are required (and
others are actually sensitive) so filter down to only what we need.
-https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_instances.html
+<https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_instances.html>
"""
# Get staging DB details
instances = rds_hook.conn.describe_db_instances(
@@ -126,7 +126,7 @@ def restore_staging_from_snapshot(
Restore the staging database from the latest snapshot.
Augment the restore operation with the existing details determined from
a previous step.
-https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/restore_db_instance_from_db_snapshot.html
+<https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/restore_db_instance_from_db_snapshot.html>
"""
log.info(
f"Creating a new {constants.TEMP_IDENTIFIER} instance from {latest_snapshot} "
@@ -145,7 +145,7 @@ def rename_db_instance(source: str, target: str, rds_hook: RdsHook = None) -> No
"""
Rename a database instance.
This can only be run on instances where mutation is allowed.
-https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/modify_db_instance.html
+<https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/modify_db_instance.html>
"""
log.info("Checking input values")
ensure_mutate_allowed(source)
@@ -5,7 +5,7 @@
snapshot of the production database.
For a full explanation of the DAG, see the implementation plan description:
-https://docs.openverse.org/projects/proposals/search_relevancy_sandbox/20230406-implementation_plan_update_staging_database.html#dag
+<https://docs.openverse.org/projects/proposals/search_relevancy_sandbox/20230406-implementation_plan_update_staging_database.html#dag>
This DAG can be skipped by setting the `SKIP_STAGING_DATABASE_RESTORE` Airflow Variable
to `true`. To change this variable, navigate to Admin > Variables in the Airflow UI,
@@ -64,7 +64,7 @@ def days_without_weekends(
Return the number of days between two dates, excluding weekends.
Adapted from:
-https://stackoverflow.com/a/3615984 CC BY-SA 2.5
+<https://stackoverflow.com/a/3615984> CC BY-SA 2.5
"""
if today.weekday() == 0 and (today - updated_at).days < 3:
# shortcut mondays to 0 if last updated on the weekend
@@ -6,10 +6,10 @@
Output: TSV file containing the media and the
respective meta-data.
-Notes: https://api.aucklandmuseum.com/
-Resource: https://api.aucklandmuseum.com/
-https://github.com/AucklandMuseum/API/wiki/Tutorial
+Notes: <https://api.aucklandmuseum.com/>
+Resource: <https://api.aucklandmuseum.com/>
+<https://github.com/AucklandMuseum/API/wiki/Tutorial>
Resource | Requests per second | Requests per day
-- | -- | --
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/cc_mixter.py
@@ -6,7 +6,7 @@
Output: TSV file containing the media and the
respective meta-data.
-Notes: Documentation: https://ccmixter.org/query-api
+Notes: Documentation: <https://ccmixter.org/query-api>
ccMixter sends bad JSON and extremely huge headers, both
of which need workarounds that are handled by this DAG.
"""
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/europeana.py
@@ -6,7 +6,7 @@
Output: TSV file containing the images and the
respective meta-data.
-Notes: https://pro.europeana.eu/page/search
+Notes: <https://pro.europeana.eu/page/search>
"""

import argparse
@@ -6,8 +6,8 @@
Output: TSV file containing the images and the
respective meta-data.
-Notes: https://api.finna.fi/swagger-ui/
-https://www.finna.fi/Content/help-syntax?lng=en-gb
+Notes: <https://api.finna.fi/swagger-ui/>
+<https://www.finna.fi/Content/help-syntax?lng=en-gb>
The Finnish Museums provider script is a dated DAG that
ingests all records that were last updated in the previous
day. Because of this, it is not necessary to run a separate
5 changes: 3 additions & 2 deletions catalog/dags/providers/provider_api_scripts/flickr.py
@@ -6,8 +6,9 @@
Output: TSV file containing the images and the
respective meta-data.
-Notes: https://www.flickr.com/help/terms/api
-Rate limit: 3600 requests per hour.
+Notes: <https://www.flickr.com/help/terms/api>
+Rate limit: 3600 requests per hour.
"""

import argparse
9 changes: 5 additions & 4 deletions catalog/dags/providers/provider_api_scripts/freesound.py
@@ -6,10 +6,11 @@
Output: TSV file containing the image, the respective
meta-data.
-Notes: https://freesound.org/docs/api/
-Rate limit: No limit for our API key.
-This script can be run either to ingest the full dataset or
-as a dated DAG.
+Notes: <https://freesound.org/docs/api/>
+Rate limit: No limit for our API key.
+This script can be run either to ingest the full dataset or
+as a dated DAG.
"""

import functools
8 changes: 4 additions & 4 deletions catalog/dags/providers/provider_api_scripts/inaturalist.py
@@ -4,15 +4,15 @@
Output: Records loaded to the image catalog table.
Notes: The iNaturalist API is not intended for data scraping.
-https://api.inaturalist.org/v1/docs/
+<https://api.inaturalist.org/v1/docs/>
But there is a full dump intended for sharing on S3.
-https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata
+<https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata>
Because these are exceptionally large normalized tables, as opposed to more document
oriented API responses, we found that bringing the data into postgres first
was the most effective approach. More detail in slack here:
-https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N
+<https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N>
We use the table structure defined here,
-https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
+<https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql>
except for adding ancestry tags to the taxa table.
"""

8 changes: 4 additions & 4 deletions catalog/dags/providers/provider_api_scripts/jamendo.py
@@ -5,7 +5,7 @@
Output: TSV file containing the audio meta-data.
-Notes: https://api.jamendo.com/v3.0/tracks/
+Notes: <https://api.jamendo.com/v3.0/tracks/>
35,000 requests per month for non-commercial apps
Jamendo Music has more than 500,000 tracks shared by
40,000 artists from over 150 countries all over
@@ -104,8 +104,8 @@ def _add_trailing_slash(url: str | None) -> str | None:
"""
Jamendo image URLs are missing a trailing slash, which when viewed normally in
the browser get redirected to the correct URL. Example:
-- https://usercontent.jamendo.com?type=album&id=100007&width=300 (before)
-- https://usercontent.jamendo.com/?type=album&id=100007&width=300 (after)
+- <https://usercontent.jamendo.com?type=album&id=100007&width=300> (before)
+- <https://usercontent.jamendo.com/?type=album&id=100007&width=300> (after)
Due to the way photon processes thumbnails, we need to add this trailing slash
to the url prior to the query params if it does not have one.
@@ -120,7 +120,7 @@ def _get_audio_url(self, data):
Audio URLs have a "from" param which seems to encapsulate information about the
calling application. Example from the API:
-https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31&from=app-devsite
+<https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31&from=app-devsite>
This information looks like an API key or secret when returned, so we remove it
since it's not necessary for serving the audio files.
>>> base_url = "https://prod-1.storage.jamendo.com/"
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/justtakeitfree.py
@@ -6,8 +6,8 @@
Output: TSV file containing the media and the
respective meta-data.
-Notes: https://justtakeitfree.com/api/api.php
-This API requires an API key. For more details, see https://github.com/WordPress/openverse/pull/2793
+Notes: <https://justtakeitfree.com/api/api.php>
+This API requires an API key. For more details, see <https://github.com/WordPress/openverse/pull/2793>
"""

import logging
@@ -6,22 +6,22 @@
Output: TSV file containing the image, their respective
meta-data.
-Notes: https://metmuseum.github.io/#search
+Notes: <https://metmuseum.github.io/#search>
"Please limit requests to 80 requests per second." May need to
bump up the delay (e.g. to 3 seconds), to avoid of blocking
during local development testing.
Some analysis to improve data quality was conducted using a
-separate csv file here: https://github.com/metmuseum/openaccess
+separate csv file here: <https://github.com/metmuseum/openaccess>
Get a list of object IDs:
-https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10
+<https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10>
Get a specific object:
-https://collectionapi.metmuseum.org/public/collection/v1/objects/1027
+<https://collectionapi.metmuseum.org/public/collection/v1/objects/1027>
The search functionality requires a specific query (term search)
in addition to date and public domain. It seems like it won't
connect with just date and license.
-https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07
+<https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07>
"""

2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/nappy.py
@@ -7,7 +7,7 @@
Notes: This api was written specially for Openverse.
There are no known limits or restrictions.
-https://nappy.co/
+<https://nappy.co/>
"""

2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/phylopic.py
@@ -6,7 +6,7 @@
Output: TSV file containing the image,
their respective meta-data.
-Notes: http://api-docs.phylopic.org/v2/
+Notes: <http://api-docs.phylopic.org/v2/>
No rate limit specified.
"""

6 changes: 3 additions & 3 deletions catalog/dags/providers/provider_api_scripts/rawpixel.py
@@ -10,7 +10,7 @@
directly if we run into any issues.
The public API max results range is limited to 100,000 results,
although the API key we've been given can circumvent this limit.
-https://www.rawpixel.com/api/v1/search?tags=$publicdomain&page=1&pagesize=100
+<https://www.rawpixel.com/api/v1/search?tags=$publicdomain&page=1&pagesize=100>
"""

import base64
@@ -97,8 +97,8 @@ def _get_signature(self, query_params: dict) -> str:
URL encode the ordered parameters in a way that matches Node's
querystring.stringify as closely as possible
-See: https://docs.python.org/3.10/library/urllib.parse.html#urllib.parse.urlencode
-and https://nodejs.org/api/querystring.html#querystringstringifyobj-sep-eq-options
+See: <https://docs.python.org/3.10/library/urllib.parse.html#urllib.parse.urlencode>
+and <https://nodejs.org/api/querystring.html#querystringstringifyobj-sep-eq-options>
""" # noqa: E501
# Params must be ordered for deterministic computation
ordered_params = {k: v for k, v in sorted(query_params.items())}
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/science_museum.py
@@ -6,7 +6,7 @@
Output: TSV file containing the image, the respective
meta-data.
-Notes: https://github.com/TheScienceMuseum/collectionsonline/wiki/Collections-Online-API
+Notes: <https://github.com/TheScienceMuseum/collectionsonline/wiki/Collections-Online-API>
Rate limited, no specific rate given.
""" # noqa: E501

@@ -50,7 +50,7 @@ def _get_year_ranges(final_year: int) -> list[tuple[int, int]]:
The Science Museum API currently raises a 400 when attempting to access
any page number higher than 50
-(https://github.com/TheScienceMuseum/collectionsonline/issues/1470).
+(<https://github.com/TheScienceMuseum/collectionsonline/issues/1470>).
To avoid this, we ingest data for small ranges of years at a time,
in order to split the data into batches less than 50 pages each.
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/smithsonian.py
@@ -5,7 +5,7 @@
Output: TSV file containing the images and the respective meta-data.
-Notes: https://api.si.edu/openaccess/api/v1.0/search
+Notes: <https://api.si.edu/openaccess/api/v1.0/search>
"""

import logging
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/smk.py
@@ -5,7 +5,7 @@
Output: TSV file containing the media metadata.
-Notes: https://www.smk.dk/en/article/smk-api/
+Notes: <https://www.smk.dk/en/article/smk-api/>
"""

import logging
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/stocksnap.py
@@ -5,8 +5,8 @@
Output: TSV file containing the image, the respective meta-data.
-Notes: https://stocksnap.io/api/load-photos/date/desc/1
-https://stocksnap.io/faq
+Notes: <https://stocksnap.io/api/load-photos/date/desc/1>
+<https://stocksnap.io/faq>
All images are licensed under CC0.
No rate limits or authorization required.
API is undocumented.
@@ -121,7 +121,7 @@ def _get_timestamp_pairs(self, **kwargs):
Determine a set of timestamp pairs.
Some provider APIs can behave unexpectedly when querying large datasets,
resulting in large numbers of duplicates and eventual DAG timeouts
-(see https://github.com/WordPress/openverse-catalog/pull/879 for an
+(see <https://github.com/WordPress/openverse-catalog/pull/879> for an
example). To avoid this, when we detect that a time period contains a large
number of records we split it up into multiple smaller time periods and
run ingestion separately for each.
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/wordpress.py
@@ -5,7 +5,7 @@
Output: TSV file containing the media metadata.
-Notes: https://wordpress.org/photos/wp-json/wp/v2
+Notes: <https://wordpress.org/photos/wp-json/wp/v2>
Provide photos, media, users and more related resources.
No rate limit specified.
"""
@@ -6,7 +6,7 @@
Output: TSV file containing the 3D models, their respective images and meta-data.
-Notes: https://www.thingiverse.com/developers/getting-started
+Notes: <https://www.thingiverse.com/developers/getting-started>
All API requests require authentication.
Rate limiting is 300 per 5 minute window.
"""
@@ -6,7 +6,7 @@
Output: TSV file containing the images and the
respective meta-data.
-Notes: http://api.thewalters.org/
+Notes: <http://api.thewalters.org/>
Rate limit: 250000 Per Day Per Key
"""

0 comments on commit d500b77