From c6d673566dc23d482275fff8dd0a641949e234d4 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Mon, 25 Sep 2023 17:27:35 -0400 Subject: [PATCH 1/8] move DAGs.md to documentation folder --- catalog/justfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/catalog/justfile b/catalog/justfile index 8d9f0c333d6..6ded10433ff 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -127,10 +127,10 @@ generate-dag-docs fail_on_diff="false": {{ SERVICE }} \ "bash -c 'python catalog/utilities/dag_doc_gen/dag_doc_generation.py && chmod 666 /opt/airflow/catalog/utilities/dag_doc_gen/DAGs.md'" # Move the file to the top level, since that level is not mounted into the container - mv utilities/dag_doc_gen/DAGs.md DAGs.md + mv utilities/dag_doc_gen/DAGs.md ../documentation/DAGs.md echo -n "Running linting..." # Linting step afterwards is necessary since the generated output differs greatly from what prettier expects - just ../lint prettier catalog/DAGs.md &>/dev/null || true + just ../lint prettier documentation/DAGs.md &>/dev/null || true echo "Done!" if {{ fail_on_diff }}; then set +e From aa5d40b1c4dbf27973af7bc8b9b14ee12cd87c98 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Mon, 25 Sep 2023 17:37:29 -0400 Subject: [PATCH 2/8] revise comment --- catalog/justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/justfile b/catalog/justfile index 6ded10433ff..a491c0f6d51 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -126,7 +126,7 @@ generate-dag-docs fail_on_diff="false": just ../run \ {{ SERVICE }} \ "bash -c 'python catalog/utilities/dag_doc_gen/dag_doc_generation.py && chmod 666 /opt/airflow/catalog/utilities/dag_doc_gen/DAGs.md'" - # Move the file to the top level, since that level is not mounted into the container + # Move the file to the top level on the documentation folder, since that level is not mounted into the container mv utilities/dag_doc_gen/DAGs.md ../documentation/DAGs.md echo -n "Running linting..." # Linting step afterwards is necessary since the generated output differs greatly from what prettier expects From 43beb15d902f78e1d72d3c70dd1c4496e4d887cb Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Mon, 25 Sep 2023 17:41:17 -0400 Subject: [PATCH 3/8] add DAGs.md to documentation folder --- documentation/DAGs.md | 1117 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1117 insertions(+) create mode 100644 documentation/DAGs.md diff --git a/documentation/DAGs.md b/documentation/DAGs.md new file mode 100644 index 00000000000..ac3212fae71 --- /dev/null +++ b/documentation/DAGs.md @@ -0,0 +1,1117 @@ +# DAGs + +_Note: this document is auto-generated and should not be manually edited_ + +This document describes the DAGs available along with pertinent DAG information +and the DAG's documentation. + +The DAGs are shown in two forms: + +- [DAGs by Type](#dags-by-type) +- [Individual DAG documentation](#dag-documentation) + +# DAGs by Type + +The following are DAGs grouped by their primary tag: + +1. [Data Normalization](#data_normalization) +1. [Data Refresh](#data_refresh) +1. [Database](#database) +1. [Maintenance](#maintenance) +1. [Oauth](#oauth) +1. [Other](#other) +1. [Popularity Refresh](#popularity_refresh) +1. [Provider](#provider) +1. 
[Provider Reingestion](#provider-reingestion) + +## Data Normalization + +| DAG ID | Schedule Interval | +| ------------------------------------- | ----------------- | +| [`add_license_url`](#add_license_url) | `None` | + +## Data Refresh + +| DAG ID | Schedule Interval | +| ------------------------------------------------------------- | ----------------- | +| [`audio_data_refresh`](#audio_data_refresh) | `0 0 * * 1` | +| [`create_filtered_audio_index`](#create_filtered_audio_index) | `None` | +| [`create_filtered_image_index`](#create_filtered_image_index) | `None` | +| [`image_data_refresh`](#image_data_refresh) | `0 0 * * 1` | + +## Database + +| DAG ID | Schedule Interval | +| --------------------------------------------------------------------------------- | ----------------- | +| [`batched_update`](#batched_update) | `None` | +| [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) | `None` | +| [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) | `None` | +| [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` | +| [`staging_database_restore`](#staging_database_restore) | `@monthly` | + +## Maintenance + +| DAG ID | Schedule Interval | +| --------------------------------------------------------------------------- | ----------------- | +| [`airflow_log_cleanup`](#airflow_log_cleanup) | `@weekly` | +| [`check_silenced_dags`](#check_silenced_dags) | `@weekly` | +| [`flickr_audit_sub_provider_workflow`](#flickr_audit_sub_provider_workflow) | `@monthly` | +| [`pr_review_reminders`](#pr_review_reminders) | `0 0 * * 1-5` | +| [`rotate_db_snapshots`](#rotate_db_snapshots) | `0 0 * * 6` | + +## Oauth + +| DAG ID | Schedule Interval | +| ----------------------------------------------- | ----------------- | +| [`oauth2_authorization`](#oauth2_authorization) | `None` | +| [`oauth2_token_refresh`](#oauth2_token_refresh) | `0 */12 * * *` | + +## Other + +| DAG ID | Schedule Interval | +| --------------------------------------------------------- | ----------------- | +| [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) | `None` | + +## Popularity Refresh + +| DAG ID | Schedule Interval | +| ------------------------------------------------------- | ----------------- | +| [`audio_popularity_refresh`](#audio_popularity_refresh) | `@monthly` | +| [`image_popularity_refresh`](#image_popularity_refresh) | `@monthly` | + +## Provider + +| DAG ID | Schedule Interval | Dated | Media Type(s) | +| --------------------------------------------------------------- | ----------------- | ------- | ------------- | +| `brooklyn_museum_workflow` | `@monthly` | `False` | image | +| `cleveland_museum_workflow` | `@monthly` | `False` | image | +| [`europeana_workflow`](#europeana_workflow) | `@daily` | `True` | image | +| [`finnish_museums_workflow`](#finnish_museums_workflow) | `@daily` | `True` | image | +| [`flickr_workflow`](#flickr_workflow) | `@daily` | `True` | image | +| [`freesound_workflow`](#freesound_workflow) | `@quarterly` | `False` | audio | +| [`inaturalist_workflow`](#inaturalist_workflow) | `0 0 2 * *` | `False` | image | +| [`jamendo_workflow`](#jamendo_workflow) | `@monthly` | `False` | audio | +| [`justtakeitfree_workflow`](#justtakeitfree_workflow) | `@monthly` | `False` | image | +| [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) | `@daily` | `True` | image | +| `museum_victoria_workflow` | `@monthly` | `False` | image | +| [`nappy_workflow`](#nappy_workflow) | `@monthly` | `False` | image | 
+| `nypl_workflow` | `@monthly` | `False` | image | +| [`phylopic_workflow`](#phylopic_workflow) | `@weekly` | `False` | image | +| [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image | +| [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image | +| [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image | +| [`smk_workflow`](#smk_workflow) | `@monthly` | `False` | image | +| [`stocksnap_workflow`](#stocksnap_workflow) | `@monthly` | `False` | image | +| [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio | +| [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image | + +## Provider Reingestion + +| DAG ID | Schedule Interval | +| --------------------------------------------------------------------------------------- | ----------------- | +| [`flickr_reingestion_workflow`](#flickr_reingestion_workflow) | `@weekly` | +| [`metropolitan_museum_reingestion_workflow`](#metropolitan_museum_reingestion_workflow) | `@weekly` | +| [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) | `@weekly` | +| [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) | `@weekly` | + +# DAG documentation + +The following is documentation associated with each DAG (where available): + +1. [`add_license_url`](#add_license_url) +1. [`airflow_log_cleanup`](#airflow_log_cleanup) +1. [`audio_data_refresh`](#audio_data_refresh) +1. [`audio_popularity_refresh`](#audio_popularity_refresh) +1. [`batched_update`](#batched_update) +1. [`check_silenced_dags`](#check_silenced_dags) +1. [`create_filtered_audio_index`](#create_filtered_audio_index) +1. [`create_filtered_image_index`](#create_filtered_image_index) +1. [`europeana_workflow`](#europeana_workflow) +1. [`finnish_museums_workflow`](#finnish_museums_workflow) +1. [`flickr_audit_sub_provider_workflow`](#flickr_audit_sub_provider_workflow) +1. [`flickr_reingestion_workflow`](#flickr_reingestion_workflow) +1. [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) +1. [`flickr_workflow`](#flickr_workflow) +1. [`freesound_workflow`](#freesound_workflow) +1. [`image_data_refresh`](#image_data_refresh) +1. [`image_popularity_refresh`](#image_popularity_refresh) +1. [`inaturalist_workflow`](#inaturalist_workflow) +1. [`jamendo_workflow`](#jamendo_workflow) +1. [`justtakeitfree_workflow`](#justtakeitfree_workflow) +1. [`metropolitan_museum_reingestion_workflow`](#metropolitan_museum_reingestion_workflow) +1. [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) +1. [`nappy_workflow`](#nappy_workflow) +1. [`oauth2_authorization`](#oauth2_authorization) +1. [`oauth2_token_refresh`](#oauth2_token_refresh) +1. [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) +1. [`phylopic_workflow`](#phylopic_workflow) +1. [`pr_review_reminders`](#pr_review_reminders) +1. [`rawpixel_workflow`](#rawpixel_workflow) +1. [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) +1. [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) +1. [`report_pending_reported_media`](#report_pending_reported_media) +1. [`rotate_db_snapshots`](#rotate_db_snapshots) +1. [`science_museum_workflow`](#science_museum_workflow) +1. [`smithsonian_workflow`](#smithsonian_workflow) +1. [`smk_workflow`](#smk_workflow) +1. [`staging_database_restore`](#staging_database_restore) +1. [`stocksnap_workflow`](#stocksnap_workflow) +1. 
[`wikimedia_commons_workflow`](#wikimedia_commons_workflow) +1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) +1. [`wordpress_workflow`](#wordpress_workflow) + +## `add_license_url` + +### Add license URL + +Add `license_url` to all rows that have `NULL` in their `meta_data` fields. This +PR sets the meta_data value to "{license_url: https://... }", where the url is +constructed from the `license` and `license_version` columns. + +This is a maintenance DAG that should be run once. If all the null values in the +`meta_data` column are updated, the DAG will only run the first and the last +step, logging the statistics. + +## `airflow_log_cleanup` + +### Clean up airflow logs + +A maintenance workflow that you can deploy into Airflow to periodically clean +out the task logs to avoid those getting too big. By default, this will also +clean child process logs from the 'scheduler' directory. + +Can remove all log files by setting "maxLogAgeInDays" to -1. If you want to test +the DAG in the Airflow Web UI, you can also set enableDelete to `false`, and +then you will see a list of log folders that can be deleted, but will not +actually delete them. + +This should all go on one line: + +``` +airflow dags trigger --conf +'{"maxLogAgeInDays":-1, "enableDelete": "false"}' airflow_log_cleanup +``` + +`--conf` options: + +- maxLogAgeInDays: - Optional +- enableDelete: - Optional + +## `audio_data_refresh` + +### Data Refresh DAG Factory + +This file generates our data refresh DAGs using a factory function. For the +given media type these DAGs will initiate a data refresh on the ingestion server +and await the success or failure of that task. + +A data refresh occurs on the Ingestion server in the Openverse project. This is +a task which imports data from the upstream Catalog database into the API, +copies contents to a new Elasticsearch index, and finally makes the index +"live". This process is necessary to make new content added to the Catalog by +our provider DAGs available to the API. You can read more in the +[README](https://github.com/WordPress/openverse/blob/main/ingestion_server/README.md) +Importantly, the data refresh TaskGroup is also configured to handle concurrency +requirements of the Ingestion server. Finally, once the origin indexes have been +refreshed, the corresponding filtered index creation DAG is triggered. + +You can find more background information on this process in the following issues +and related PRs: + +- [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) +- [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) + +## `audio_popularity_refresh` + +### Popularity Refresh DAG Factory + +This file generates our popularity refresh DAGs using a factory function. + +For the given media type these DAGs will first update the popularity metrics +table, adding any new metrics and updating the percentile that is used in +calculating the popularity constants. It then refreshes the popularity constants +view, which recalculates the popularity constant for each provider. + +Once the constants have been updated, the DAG will trigger a `batched_update` +DagRun for each provider of this media_type that is configured to support +popularity data. The batched update recalculates standardized popularity scores +for all records, using the new constant. When the updates are complete, all +records have up-to-date popularity data. 
This DAG can be run concurrently with +data refreshes and regular ingestion. + +You can find more background information on this process in the following +implementation plan: + +- [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) + +## `batched_update` + +Batched Update DAG + +This DAG is used to run a batched SQL update on a media table in the Catalog +database. It is automatically triggered by the `popularity_refresh` DAGs to +refresh popularity data using newly calculated constants, but can also be +triggered manually with custom SQL operations. + +The DAG must be run with a valid dag_run configuration specifying the SQL +commands to be run. The DAG will then split the rows to be updated into batches, +and report to Slack when all batches have been updated. It handles all +deadlocking and timeout concerns, ensuring that the provided SQL is run without +interfering with ingestion. For more information, see the implementation plan: +https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html#special-considerations-avoiding-deadlocks-and-timeouts + +By default the DAG will run as a dry_run, logging the generated SQL but not +actually running it. To actually perform the update, the `dry_run` parameter +must be explicitly set to `false` in the configuration. + +Required Dagrun Configuration parameters: + +- query_id: a string identifier which will be appended to temporary table used + in the update +- table_name: the name of the table to update. Must be a valid media table +- select_query: a SQL `WHERE` clause used to select the rows that will be + updated +- update_query: the SQL `UPDATE` expression to be run on all selected rows + +Optional params: + +- dry_run: bool, whether to actually run the generated SQL. True by default. +- batch_size: int number of records to process in each batch. By default, 10_000 +- update_timeout: int number of seconds to run an individual batch update before + timing out. By default, 3600 (or one hour) +- resume_update: boolean indicating whether to attempt to resume an update using + an existing temp table matching the `query_id`. When True, a new temp table is + not created. + +An example dag_run configuration used to set the thumbnails of all Flickr images +to null would look like this: + +``` +{ + "query_id": "my_flickr_query", + "table_name": "image", + "select_query": "WHERE provider='flickr'", + "update_query": "SET thumbnail=null", + "batch_size": 10, + "dry_run": false +} +``` + +The `update_batches` task automatically keeps track of its progress in an +Airflow variable suffixed with the `query_id`. If the task fails, when it +resumes (either through a retry or by being manually cleared), it will pick up +from where it left off. Manually managing this Airflow variable should not be +necessary. + +It is also possible to start an entirely new DagRun using an existing temp +table, by setting the `resume_update` param to True. With this option enabled, +the DAG will skip creating the temp table and instead attempt to run an update +with an existing temp table matching the `query_id`. This option should only be +used when the DagRun configuration needs to be changed after the table was +already created: for example, if there was a problem with the `update_query` +which caused DAG failures during the `update_batches` step. 
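+
+For example, to resume the `my_flickr_query` update from the earlier example
+after adjusting its configuration (here with a longer `update_timeout`; all
+values are illustrative), the new DagRun configuration might look like:
+
+```
+{
+    "query_id": "my_flickr_query",
+    "table_name": "image",
+    "select_query": "WHERE provider='flickr'",
+    "update_query": "SET thumbnail=null",
+    "update_timeout": 7200,
+    "batch_size": 10,
+    "dry_run": false,
+    "resume_update": true
+}
+```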
+ +## `check_silenced_dags` + +### Silenced DAGs check + +Check for DAGs that have silenced Slack alerts or skipped errors which may need +to be turned back on. + +When a DAG has known failures, it can be omitted from Slack error reporting by +adding an entry to the `SILENCED_SLACK_NOTIFICATIONS` Airflow variable. +Similarly, errors that occur during the `pull_data` step may be configured to +skip and allow ingestion to continue, using the `SKIPPED_INGESTION_ERRORS` +Airflow variable. These variables are dictionaries where the key is the `dag_id` +of the affected DAG, and the value is a list of configuration dictionaries +mapping an error `predicate` to be skipped/silenced to an open GitHub issue. + +The `check_silenced_dags` DAG iterates over the entries in the +`SILENCED_SLACK_NOTIFICATIONS` and `SKIPPED_INGESTION_ERRORS` configurations and +verifies that the associated GitHub issues are still open. If an issue has been +closed, it is assumed that the entry should be removed, and an alert is sent to +prompt manual update of the configuration. This prevents developers from +forgetting to re-enable Slack reporting or turnoff error skipping after the +issue has been resolved. + +The DAG runs weekly. + +## `create_filtered_audio_index` + +### Create filtered index DAG factory + +This module creates the filtered index creation DAGs for each media type using a +factory function. + +Filtered index creation is handled by the ingestion server. The DAGs generated +by the `build_create_filtered_index_dag` function in this module are responsible +for triggering the ingestion server action to create and populate the filtered +index for a given media type. The DAG awaits the completion of the filtered +index creation and then points the filtered index alias for the media type to +the newly created index. + +#### When this DAG runs + +The DAGs generated in this module are triggered by the data refresh DAGs. +Maintaining this process separate from the data refresh DAGs, while still +triggering it there, allows us to run filtered index creation independently of +the full data refresh. This is primarily useful in two cases: for testing +changes to the filtered index creation; and for re-running filtered index +creation if an urgent change to the sensitive terms calls for an immediate +recreation of the filtered indexes. + +#### Race conditions + +Because filtered index creation employs the `reindex` Elasticsearch API to +derive the filtered index from an existing index, we need to be mindful of the +race condition that potentially exists between the data refresh DAG and this +DAG. The race condition is caused by the fact that the data refresh DAG always +deletes the previous index once the new index for the media type is finished +being created. Consider the situation where filtered index creation is triggered +to run during a data refresh. The filtered index is being derived from the +previous index for the media type. Once the data refresh is finished, it will +delete that index, causing the reindex to halt because suddenly it has no data +source from which to pull documents. + +There are two mechanisms that prevent this from happening: + +1. The filtered index creation DAGs are not allowed to run if a data refresh for + the media type is already running. +2. The data refresh DAGs will wait for any pre-existing filtered index creation + DAG runs for the media type to finish before continuing. 
+ +This ensures that neither are depending on or modifying the origin indexes +critical for the creation of the filtered indexes. + +Because the data refresh DAG triggers the filtered index creation DAG, we do +allow a `force` param to be passed to the DAGs generated by this module. This +parameter is only for use by the data refresh DAG and should not be used when +manually triggering the DAG unless you are absolutely certain of what you are +doing. + +## `create_filtered_image_index` + +### Create filtered index DAG factory + +This module creates the filtered index creation DAGs for each media type using a +factory function. + +Filtered index creation is handled by the ingestion server. The DAGs generated +by the `build_create_filtered_index_dag` function in this module are responsible +for triggering the ingestion server action to create and populate the filtered +index for a given media type. The DAG awaits the completion of the filtered +index creation and then points the filtered index alias for the media type to +the newly created index. + +#### When this DAG runs + +The DAGs generated in this module are triggered by the data refresh DAGs. +Maintaining this process separate from the data refresh DAGs, while still +triggering it there, allows us to run filtered index creation independently of +the full data refresh. This is primarily useful in two cases: for testing +changes to the filtered index creation; and for re-running filtered index +creation if an urgent change to the sensitive terms calls for an immediate +recreation of the filtered indexes. + +#### Race conditions + +Because filtered index creation employs the `reindex` Elasticsearch API to +derive the filtered index from an existing index, we need to be mindful of the +race condition that potentially exists between the data refresh DAG and this +DAG. The race condition is caused by the fact that the data refresh DAG always +deletes the previous index once the new index for the media type is finished +being created. Consider the situation where filtered index creation is triggered +to run during a data refresh. The filtered index is being derived from the +previous index for the media type. Once the data refresh is finished, it will +delete that index, causing the reindex to halt because suddenly it has no data +source from which to pull documents. + +There are two mechanisms that prevent this from happening: + +1. The filtered index creation DAGs are not allowed to run if a data refresh for + the media type is already running. +2. The data refresh DAGs will wait for any pre-existing filtered index creation + DAG runs for the media type to finish before continuing. + +This ensures that neither are depending on or modifying the origin indexes +critical for the creation of the filtered indexes. + +Because the data refresh DAG triggers the filtered index creation DAG, we do +allow a `force` param to be passed to the DAGs generated by this module. This +parameter is only for use by the data refresh DAG and should not be used when +manually triggering the DAG unless you are absolutely certain of what you are +doing. + +## `europeana_workflow` + +Content Provider: Europeana + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the images and the respective meta-data. + +Notes: https://pro.europeana.eu/page/search + +## `finnish_museums_workflow` + +Content Provider: Finnish Museums + +ETL Process: Use the API to identify all CC licensed images. 
+ +Output: TSV file containing the images and the respective meta-data. + +Notes: https://api.finna.fi/swagger-ui/ +https://www.finna.fi/Content/help-syntax?lng=en-gb The Finnish Museums provider +script is a dated DAG that ingests all records that were last updated in the +previous day. Because of this, it is not necessary to run a separate reingestion +DAG, as updated data will be processed during regular ingestion. + +## `flickr_audit_sub_provider_workflow` + +### Flickr Sub Provider Audit + +Check the list of member institutions of the Flickr Commons for institutions +that have cc-licensed images and are not already configured as sub-providers for +the Flickr DAG. Report suggestions for new sub-providers to Slack. + +## `flickr_reingestion_workflow` + +Content Provider: Flickr + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the images and the respective meta-data. + +Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. + +## `flickr_thumbnails_removal` + +One-time run DAG to remove progressively all the old Flickr thumbnails, as they +were determined to be unsuitable for the Openverse UI requirements. + +## `flickr_workflow` + +Content Provider: Flickr + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the images and the respective meta-data. + +Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. + +## `freesound_workflow` + +Content Provider: Freesound + +ETL Process: Use the API to identify all CC-licensed images. + +Output: TSV file containing the image, the respective meta-data. + +Notes: https://freesound.org/docs/api/ Rate limit: No limit for our API key. +This script can be run either to ingest the full dataset or as a dated DAG. + +## `image_data_refresh` + +### Data Refresh DAG Factory + +This file generates our data refresh DAGs using a factory function. For the +given media type these DAGs will initiate a data refresh on the ingestion server +and await the success or failure of that task. + +A data refresh occurs on the Ingestion server in the Openverse project. This is +a task which imports data from the upstream Catalog database into the API, +copies contents to a new Elasticsearch index, and finally makes the index +"live". This process is necessary to make new content added to the Catalog by +our provider DAGs available to the API. You can read more in the +[README](https://github.com/WordPress/openverse/blob/main/ingestion_server/README.md) +Importantly, the data refresh TaskGroup is also configured to handle concurrency +requirements of the Ingestion server. Finally, once the origin indexes have been +refreshed, the corresponding filtered index creation DAG is triggered. + +You can find more background information on this process in the following issues +and related PRs: + +- [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) +- [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) + +## `image_popularity_refresh` + +### Popularity Refresh DAG Factory + +This file generates our popularity refresh DAGs using a factory function. + +For the given media type these DAGs will first update the popularity metrics +table, adding any new metrics and updating the percentile that is used in +calculating the popularity constants. 
It then refreshes the popularity constants +view, which recalculates the popularity constant for each provider. + +Once the constants have been updated, the DAG will trigger a `batched_update` +DagRun for each provider of this media_type that is configured to support +popularity data. The batched update recalculates standardized popularity scores +for all records, using the new constant. When the updates are complete, all +records have up-to-date popularity data. This DAG can be run concurrently with +data refreshes and regular ingestion. + +You can find more background information on this process in the following +implementation plan: + +- [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) + +## `inaturalist_workflow` + +Provider: iNaturalist + +Output: Records loaded to the image catalog table. + +Notes: The iNaturalist API is not intended for data scraping. +https://api.inaturalist.org/v1/docs/ But there is a full dump intended for +sharing on S3. +https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata +Because these are very large normalized tables, as opposed to more document +oriented API responses, we found that bringing the data into postgres first was +the most effective approach. More detail in slack here: +https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N +We use the table structure defined here, +https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql +except for adding ancestry tags to the taxa table. + +## `jamendo_workflow` + +Content Provider: Jamendo + +ETL Process: Use the API to identify all CC-licensed audio. + +Output: TSV file containing the audio meta-data. + +Notes: https://api.jamendo.com/v3.0/tracks/ 35,000 requests per month for +non-commercial apps Jamendo Music has more than 500,000 tracks shared by 40,000 +artists from over 150 countries all over the world. Audio quality: uploaded as +WAV/ FLAC/ AIFF bit depth: 16/24 sample rate: 44.1 or 48 kHz channels: 1/2 + +## `justtakeitfree_workflow` + +Content Provider: Justtakeitfree + +ETL Process: Use the API to identify all CC licensed media. + +Output: TSV file containing the media and the respective meta-data. + +Notes: https://justtakeitfree.com/api/api.php This API requires an API key. For +more details, see https://github.com/WordPress/openverse/pull/2793 + +## `metropolitan_museum_reingestion_workflow` + +Content Provider: Metropolitan Museum of Art + +ETL Process: Use the API to identify all CC0 artworks. + +Output: TSV file containing the image, their respective meta-data. + +Notes: https://metmuseum.github.io/#search "Please limit requests to 80 requests +per second." May need to bump up the delay (e.g. to 3 seconds), to avoid of +blocking during local development testing. + + Some analysis to improve data quality was conducted using a + separate csv file here: https://github.com/metmuseum/openaccess + + Get a list of object IDs: + https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 + Get a specific object: + https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 + The search functionality requires a specific query (term search) + in addition to date and public domain. It seems like it won't + connect with just date and license. 
+ https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 + +## `metropolitan_museum_workflow` + +Content Provider: Metropolitan Museum of Art + +ETL Process: Use the API to identify all CC0 artworks. + +Output: TSV file containing the image, their respective meta-data. + +Notes: https://metmuseum.github.io/#search "Please limit requests to 80 requests +per second." May need to bump up the delay (e.g. to 3 seconds), to avoid of +blocking during local development testing. + + Some analysis to improve data quality was conducted using a + separate csv file here: https://github.com/metmuseum/openaccess + + Get a list of object IDs: + https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 + Get a specific object: + https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 + The search functionality requires a specific query (term search) + in addition to date and public domain. It seems like it won't + connect with just date and license. + https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 + +## `nappy_workflow` + +Content Provider: Nappy + +ETL Process: Use the API to identify all CC0-licensed images. + +Output: TSV file containing the image meta-data. + +Notes: This api was written specially for Openverse. There are no known limits +or restrictions. https://nappy.co/ + +## `oauth2_authorization` + +### OAuth Provider Authorization + +Iterates through all the OAuth2 providers and attempts to authorize them using +tokens found in the in the `OAUTH2_AUTH_KEYS` Variable. Once authorization has +been completed successfully, the auth token is removed from that Variable. The +authorization will create an access/refresh token pair in the +`OAUTH2_ACCESS_TOKENS` Variable. + +**Current Providers**: + +- Freesound + +## `oauth2_token_refresh` + +### OAuth Provider Token Refresh + +Iterates through all OAuth2 providers and attempts to refresh the access token +using the refresh token stored in the `OAUTH2_ACCESS_TOKENS` Variable. This DAG +will update the tokens stored in the Variable upon successful refresh. + +**Current Providers**: + +- Freesound + +## `phylopic_reingestion_workflow` + +Content Provider: PhyloPic + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the image, their respective meta-data. + +Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. + +## `phylopic_workflow` + +Content Provider: PhyloPic + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the image, their respective meta-data. + +Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. + +## `pr_review_reminders` + +### PR Review Reminders + +Iterates through open PRs in our repositories and pings assigned reviewers who +have not yet approved the PR or explicitly requested changes. + +This DAG runs daily and pings on the following schedule based on priority label: + +| priority | days | +| -------- | ------- | +| critical | 1 day | +| high | >2 days | +| medium | >4 days | +| low | >7 days | + +The DAG does not ping on Saturday and Sunday and accounts for weekend days when +determining how much time has passed since the review. + +Unfortunately the DAG does not know when someone is on vacation. It is up to the +author of the PR to re-assign review if one of the randomly selected reviewers +is unavailable for the time period during which the PR should be reviewed. 
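+
+As a sketch of the weekend-aware counting (an illustrative example only, not
+the DAG's actual implementation):
+
+```
+from datetime import date, timedelta
+
+
+def weekdays_since(assigned: date, today: date) -> int:
+    """Count elapsed days between two dates, skipping Saturdays and Sundays."""
+    elapsed = 0
+    current = assigned
+    while current < today:
+        current += timedelta(days=1)
+        if current.weekday() < 5:  # Monday=0 ... Friday=4
+            elapsed += 1
+    return elapsed
+
+
+# A review assigned on Friday 2023-09-22 counts as only 1 weekday old on
+# Monday 2023-09-25, even though three calendar days have passed.
+print(weekdays_since(date(2023, 9, 22), date(2023, 9, 25)))
+```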
+ +## `rawpixel_workflow` + +Content Provider: Rawpixel + +ETL Process: Use the API to identify all CC-licensed images. + +Output: TSV file containing the image meta-data. + +Notes: Rawpixel has given Openverse beta access to their API. This API is +undocumented, and we will need to contact Rawpixel directly if we run into any +issues. The public API max results range is limited to 100,000 results, although +the API key we've been given can circumvent this limit. +https://www.rawpixel.com/api/v1/search?tags=$publicdomain&page=1&pagesize=100 + +## `recreate_audio_popularity_calculation` + +This file generates Apache Airflow DAGs that, for the given media type, +completely wipes out and recreates the PostgreSQL functions involved in +calculating our standardized popularity metric. + +Note that they do not drop any tables or views related to popularity, and they +do not perform any popularity calculations. Once this DAG has been run, the +associated popularity refresh DAG must be run in order to actually recalculate +popularity constants and standardized popularity scores using the new functions. + +These DAGs are not on a schedule, and should only be run manually when new SQL +code is deployed for the calculation. + +## `recreate_image_popularity_calculation` + +This file generates Apache Airflow DAGs that, for the given media type, +completely wipes out and recreates the PostgreSQL functions involved in +calculating our standardized popularity metric. + +Note that they do not drop any tables or views related to popularity, and they +do not perform any popularity calculations. Once this DAG has been run, the +associated popularity refresh DAG must be run in order to actually recalculate +popularity constants and standardized popularity scores using the new functions. + +These DAGs are not on a schedule, and should only be run manually when new SQL +code is deployed for the calculation. + +## `report_pending_reported_media` + +### Report Pending Reported Media DAG + +This DAG checks for any user-reported media pending manual review, and alerts +via Slack. + +Media may be reported for mature content or copyright infringement, for example. +Once reported, these require manual review through the Django Admin to determine +whether further action (such as deindexing the record) needs to be taken. If a +record has been reported multiple times, it only needs to be reviewed once and +so is only counted once in the reporting by this DAG. + +## `rotate_db_snapshots` + +Manages weekly database snapshots. + +RDS does not support weekly snapshots schedules on its own, so we need a DAG to +manage this for us. + +It runs on Saturdays at 00:00 UTC in order to happen before the data refresh. + +The DAG will automatically delete the oldest snapshots when more snaphots exist +than it is configured to retain. + +Requires two variables: + +`CATALOG_RDS_DB_IDENTIFIER`: The "DBIdentifier" of the RDS DB instance. +`CATALOG_RDS_SNAPSHOTS_TO_RETAIN`: How many historical snapshots to retain. + +## `science_museum_workflow` + +Content Provider: Science Museum + +ETL Process: Use the API to identify all CC-licensed images. + +Output: TSV file containing the image, the respective meta-data. + +Notes: +https://github.com/TheScienceMuseum/collectionsonline/wiki/Collections-Online-API +Rate limited, no specific rate given. + +## `smithsonian_workflow` + +Content Provider: Smithsonian + +ETL Process: Use the API to identify all CC licensed images. + +Output: TSV file containing the images and the respective meta-data. 
+ +Notes: https://api.si.edu/openaccess/api/v1.0/search + +## `smk_workflow` + +Content Provider: Statens Museum for Kunst (National Gallery of Denmark) + +ETL Process: Use the API to identify all openly licensed media. + +Output: TSV file containing the media metadata. + +Notes: https://www.smk.dk/en/article/smk-api/ + +## `staging_database_restore` + +### Update the staging database + +This DAG is responsible for updating the staging database using the most recent +snapshot of the production database. + +For a full explanation of the DAG, see the implementation plan description: +https://docs.openverse.org/projects/proposals/search_relevancy_sandbox/20230406-implementation_plan_update_staging_database.html#dag + +This DAG can be skipped by setting the `SKIP_STAGING_DATABASE_RESTORE` Airflow +Variable to `true`. To change this variable, navigate to Admin > Variables in +the Airflow UI, then click the "edit" button next to the variable and set the +value to either `true` or `false`. + +This DAG will default to using the standard AWS connection ID for the RDS +operations. For local testing, you can set up two environment variables to have +the RDS operations run using a different hook: + +- `AWS_RDS_CONN_ID`: The Airflow connection ID to use for RDS operations (e.g. + `aws_rds`) +- `AIRFLOW_CONN_`: The connection string to use for RDS operations (per the + above example, it might be `AIRFLOW_CONN_AWS_RDS`) + +## `stocksnap_workflow` + +Content Provider: StockSnap + +ETL Process: Use the API to identify all CC-licensed images. + +Output: TSV file containing the image, the respective meta-data. + +Notes: https://stocksnap.io/api/load-photos/date/desc/1 https://stocksnap.io/faq +All images are licensed under CC0. No rate limits or authorization required. API +is undocumented. + +## `wikimedia_commons_workflow` + +**Content Provider:** Wikimedia Commons + +**ETL Process:** Use the API to identify all CC-licensed images. + +**Output:** TSV file containing the image, the respective meta-data. + +### Notes + +Rate limit of no more than 200 requests/second, and we are required to set a +unique User-Agent field +([docs](https://www.mediawiki.org/wiki/Wikimedia_REST_API#Terms_and_conditions)). + +Wikimedia Commons uses an implementation of the +[MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page). This API is +incredibly complex in the level of configuration you can provide when querying, +and as such it can also be quite abstruse. The most straightforward docs can be +found on the +[Wikimedia website directly](https://commons.wikimedia.org/w/api.php?action=help&modules=query), +as these show all the parameters available. Specifications on queries can also +be found on the [query page](https://www.mediawiki.org/wiki/API:Query). + +Different kinds of queries can be made against the API using "modules", we use +the [allimages module](https://www.mediawiki.org/wiki/API:Allimages), which lets +us search for images in a given time range (see `"generator": "allimages"` in +the query params). + +Many queries will return results in batches, with the API supplying a "continue" +token. This token is used on subsequent calls to tell the API where to start the +next set of results from; it functions as a page offset +([docs](https://www.mediawiki.org/wiki/API:Query#Continuing_queries)). + +We can also specify what kinds of information we want for the query. Wikimedia +has a massive amount of data on it, so it only returns what we ask for. 
The +fields that are returned are defined by the +[properties](https://www.mediawiki.org/wiki/API:Properties) or "props" +parameter. Sub-properties can also be defined, with the parameter name of the +sub-property determined by an abbreviation of the higher-level property. For +instance, if our property is "imageinfo" and we want to set sub-property values, +we would define those in "iiprops". + +The data within a property is paginated as well, Wikimedia will handle iteration +through the property sub-pages using the "continue" token +([see here for more details](https://www.mediawiki.org/wiki/API:Properties#Additional_notes)). + +Depending on the kind of property data that's being returned, it's possible for +the API to iterate extensively on a specific media item. What Wikimedia is +iterating over in these cases can be gleaned from the "continue" token. Those +tokens take the form of, as I understand it, +"||", paired with an "continue" value +for the property being iterated over. For example, if we're were iterating over +a set of image properties, the token might look like: + +``` +{ + "iicontinue": "The_Railway_Chronicle_1844.pdf|20221209222801", + "gaicontinue": "20221209222614|NTUL-0527100_英國產業革命史略.pdf", + "continue": "gaicontinue||globalusage", +} +``` + +In this case, we're iterating over the "global all images" generator +(gaicontinue) as our primary iterator, with the "image properties" (iicontinue) +as the secondary continue iterator. The "globalusage" property would be the next +property to iterate over. It's also possible for multiple sub-properties to be +iterated over simultaneously, in which case the "continue" token would not have +a secondary value (e.g. `gaicontinue||`). + +In most runs, the "continue" key will be `gaicontinue||` after the first +request, which means that we have more than one batch to iterate over for the +primary iterator. Some days will have fewer images than the batch limit but +still have multiple batches of content on the secondary iterator, which means +the "continue" key may not have a primary iteration component (e.g. +`||globalusage`). This token can also be seen when the first request has more +data in the secondary iterator, before we've processed any data on the primary +iterator. + +Occasionally, the ingester will come across a piece of media that has many +results for the property it's iterating over. An example of this can include an +item being on many pages, this it would have many "global usage" results. In +order to process the entire batch, we have to iterate over _all_ of the returned +results; Wikimedia does not provide a mechanism to "skip to the end" of a batch. +On numerous occasions, this iteration has been so extensive that the pull media +task has hit the task's timeout. To avoid this, we limit the number of +iterations we make for parsing through a sub-property's data. If we hit the +limit, we re-issue the original query _without_ requesting properties that +returned large amounts of data. Unfortunately, this means that we will **not** +have that property's data for these items the second time around (e.g. +popularity data if we needed to skip global usage). In the case of popularity, +especially since the problem with these images is that they're so popular, we +want to preserve that information where possible! So we cache the popularity +data from previous iterations and use it in subsequent ones if we come across +the same item again. 
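+
+Putting the basic mechanics together, a bare-bones sketch of the
+query/continuation loop looks roughly like the following (the parameter values
+here are illustrative, not the exact ones the provider script sends):
+
+```
+import requests
+
+API_URL = "https://commons.wikimedia.org/w/api.php"
+
+params = {
+    "action": "query",
+    "format": "json",
+    "generator": "allimages",
+    "gaisort": "timestamp",
+    "gaistart": "2023-01-01T00:00:00Z",
+    "gaiend": "2023-01-02T00:00:00Z",
+    "gailimit": 100,
+    "prop": "imageinfo|globalusage",
+    "iiprop": "url|user|extmetadata",
+}
+headers = {"User-Agent": "openverse-docs-example (illustrative only)"}
+
+while True:
+    data = requests.get(API_URL, params=params, headers=headers).json()
+    for page in data.get("query", {}).get("pages", {}).values():
+        pass  # process each media item, e.g. map it onto a TSV record
+    if "continue" not in data:
+        break
+    # Feed the opaque continuation values (gaicontinue, iicontinue, ...) back
+    # into the next request so the API resumes where the last batch ended.
+    params.update(data["continue"])
+```
+
+The real ingester layers the sub-property iteration limit and the
+retry-without-heavy-properties behavior described above on top of this loop.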
+ +Below are some specific references to various properties, with examples for +cases where they might exceed the limit. Technically, it's feasible for almost +any property to exceed the limit, but these are the ones that we've seen in +practice. + +#### `imageinfo` + +[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) + +[Example where metadata has hundreds of data points](https://commons.wikimedia.org/wiki/File:The_Railway_Chronicle_1844.pdf#metadata) +(see "Metadata" table, which may need to be expanded). + +For these requests, we can remove the `metadata` property from the `iiprops` +parameter to avoid this issue on subsequent iterations. + +#### `globalusage` + +[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) + +[Example where an image is used on almost every wiki](https://commons.wikimedia.org/w/index.php?curid=4298234). + +For these requests, we can remove the `globalusage` property from the `prop` +parameter entirely and eschew the popularity data for these items. + +## `wikimedia_reingestion_workflow` + +**Content Provider:** Wikimedia Commons + +**ETL Process:** Use the API to identify all CC-licensed images. + +**Output:** TSV file containing the image, the respective meta-data. + +### Notes + +Rate limit of no more than 200 requests/second, and we are required to set a +unique User-Agent field +([docs](https://www.mediawiki.org/wiki/Wikimedia_REST_API#Terms_and_conditions)). + +Wikimedia Commons uses an implementation of the +[MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page). This API is +incredibly complex in the level of configuration you can provide when querying, +and as such it can also be quite abstruse. The most straightforward docs can be +found on the +[Wikimedia website directly](https://commons.wikimedia.org/w/api.php?action=help&modules=query), +as these show all the parameters available. Specifications on queries can also +be found on the [query page](https://www.mediawiki.org/wiki/API:Query). + +Different kinds of queries can be made against the API using "modules", we use +the [allimages module](https://www.mediawiki.org/wiki/API:Allimages), which lets +us search for images in a given time range (see `"generator": "allimages"` in +the query params). + +Many queries will return results in batches, with the API supplying a "continue" +token. This token is used on subsequent calls to tell the API where to start the +next set of results from; it functions as a page offset +([docs](https://www.mediawiki.org/wiki/API:Query#Continuing_queries)). + +We can also specify what kinds of information we want for the query. Wikimedia +has a massive amount of data on it, so it only returns what we ask for. The +fields that are returned are defined by the +[properties](https://www.mediawiki.org/wiki/API:Properties) or "props" +parameter. Sub-properties can also be defined, with the parameter name of the +sub-property determined by an abbreviation of the higher-level property. For +instance, if our property is "imageinfo" and we want to set sub-property values, +we would define those in "iiprops". + +The data within a property is paginated as well, Wikimedia will handle iteration +through the property sub-pages using the "continue" token +([see here for more details](https://www.mediawiki.org/wiki/API:Properties#Additional_notes)). + +Depending on the kind of property data that's being returned, it's possible for +the API to iterate extensively on a specific media item. 
What Wikimedia is +iterating over in these cases can be gleaned from the "continue" token. Those +tokens take the form of, as I understand it, +"||", paired with an "continue" value +for the property being iterated over. For example, if we're were iterating over +a set of image properties, the token might look like: + +``` +{ + "iicontinue": "The_Railway_Chronicle_1844.pdf|20221209222801", + "gaicontinue": "20221209222614|NTUL-0527100_英國產業革命史略.pdf", + "continue": "gaicontinue||globalusage", +} +``` + +In this case, we're iterating over the "global all images" generator +(gaicontinue) as our primary iterator, with the "image properties" (iicontinue) +as the secondary continue iterator. The "globalusage" property would be the next +property to iterate over. It's also possible for multiple sub-properties to be +iterated over simultaneously, in which case the "continue" token would not have +a secondary value (e.g. `gaicontinue||`). + +In most runs, the "continue" key will be `gaicontinue||` after the first +request, which means that we have more than one batch to iterate over for the +primary iterator. Some days will have fewer images than the batch limit but +still have multiple batches of content on the secondary iterator, which means +the "continue" key may not have a primary iteration component (e.g. +`||globalusage`). This token can also be seen when the first request has more +data in the secondary iterator, before we've processed any data on the primary +iterator. + +Occasionally, the ingester will come across a piece of media that has many +results for the property it's iterating over. An example of this can include an +item being on many pages, this it would have many "global usage" results. In +order to process the entire batch, we have to iterate over _all_ of the returned +results; Wikimedia does not provide a mechanism to "skip to the end" of a batch. +On numerous occasions, this iteration has been so extensive that the pull media +task has hit the task's timeout. To avoid this, we limit the number of +iterations we make for parsing through a sub-property's data. If we hit the +limit, we re-issue the original query _without_ requesting properties that +returned large amounts of data. Unfortunately, this means that we will **not** +have that property's data for these items the second time around (e.g. +popularity data if we needed to skip global usage). In the case of popularity, +especially since the problem with these images is that they're so popular, we +want to preserve that information where possible! So we cache the popularity +data from previous iterations and use it in subsequent ones if we come across +the same item again. + +Below are some specific references to various properties, with examples for +cases where they might exceed the limit. Technically, it's feasible for almost +any property to exceed the limit, but these are the ones that we've seen in +practice. + +#### `imageinfo` + +[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) + +[Example where metadata has hundreds of data points](https://commons.wikimedia.org/wiki/File:The_Railway_Chronicle_1844.pdf#metadata) +(see "Metadata" table, which may need to be expanded). + +For these requests, we can remove the `metadata` property from the `iiprops` +parameter to avoid this issue on subsequent iterations. 
+ +#### `globalusage` + +[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) + +[Example where an image is used on almost every wiki](https://commons.wikimedia.org/w/index.php?curid=4298234). + +For these requests, we can remove the `globalusage` property from the `prop` +parameter entirely and eschew the popularity data for these items. + +## `wordpress_workflow` + +Content Provider: WordPress Photo Directory + +ETL Process: Use the API to identify all openly licensed media. + +Output: TSV file containing the media metadata. + +Notes: https://wordpress.org/photos/wp-json/wp/v2 Provide photos, media, users +and more related resources. No rate limit specified. From e319e523d418e32661113e254a5301bcca00761f Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Mon, 25 Sep 2023 18:54:31 -0400 Subject: [PATCH 4/8] DAGs.md to documentation/catalog/reference and add table of content --- catalog/justfile | 4 +- documentation/DAGs.md | 1117 ----------------- .../catalog/reference}/DAGs.md | 0 documentation/catalog/reference/index.md | 7 + 4 files changed, 9 insertions(+), 1119 deletions(-) delete mode 100644 documentation/DAGs.md rename {catalog => documentation/catalog/reference}/DAGs.md (100%) create mode 100644 documentation/catalog/reference/index.md diff --git a/catalog/justfile b/catalog/justfile index a491c0f6d51..dec78c046b9 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -127,10 +127,10 @@ generate-dag-docs fail_on_diff="false": {{ SERVICE }} \ "bash -c 'python catalog/utilities/dag_doc_gen/dag_doc_generation.py && chmod 666 /opt/airflow/catalog/utilities/dag_doc_gen/DAGs.md'" # Move the file to the top level on the documentation folder, since that level is not mounted into the container - mv utilities/dag_doc_gen/DAGs.md ../documentation/DAGs.md + mv utilities/dag_doc_gen/DAGs.md ../documentation/catalog/reference/DAGs.md echo -n "Running linting..." # Linting step afterwards is necessary since the generated output differs greatly from what prettier expects - just ../lint prettier documentation/DAGs.md &>/dev/null || true + just ../lint prettier documentation/catalog/reference/DAGs.md &>/dev/null || true echo "Done!" if {{ fail_on_diff }}; then set +e diff --git a/documentation/DAGs.md b/documentation/DAGs.md deleted file mode 100644 index ac3212fae71..00000000000 --- a/documentation/DAGs.md +++ /dev/null @@ -1,1117 +0,0 @@ -# DAGs - -_Note: this document is auto-generated and should not be manually edited_ - -This document describes the DAGs available along with pertinent DAG information -and the DAG's documentation. - -The DAGs are shown in two forms: - -- [DAGs by Type](#dags-by-type) -- [Individual DAG documentation](#dag-documentation) - -# DAGs by Type - -The following are DAGs grouped by their primary tag: - -1. [Data Normalization](#data_normalization) -1. [Data Refresh](#data_refresh) -1. [Database](#database) -1. [Maintenance](#maintenance) -1. [Oauth](#oauth) -1. [Other](#other) -1. [Popularity Refresh](#popularity_refresh) -1. [Provider](#provider) -1. 
[Provider Reingestion](#provider-reingestion) - -## Data Normalization - -| DAG ID | Schedule Interval | -| ------------------------------------- | ----------------- | -| [`add_license_url`](#add_license_url) | `None` | - -## Data Refresh - -| DAG ID | Schedule Interval | -| ------------------------------------------------------------- | ----------------- | -| [`audio_data_refresh`](#audio_data_refresh) | `0 0 * * 1` | -| [`create_filtered_audio_index`](#create_filtered_audio_index) | `None` | -| [`create_filtered_image_index`](#create_filtered_image_index) | `None` | -| [`image_data_refresh`](#image_data_refresh) | `0 0 * * 1` | - -## Database - -| DAG ID | Schedule Interval | -| --------------------------------------------------------------------------------- | ----------------- | -| [`batched_update`](#batched_update) | `None` | -| [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) | `None` | -| [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) | `None` | -| [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` | -| [`staging_database_restore`](#staging_database_restore) | `@monthly` | - -## Maintenance - -| DAG ID | Schedule Interval | -| --------------------------------------------------------------------------- | ----------------- | -| [`airflow_log_cleanup`](#airflow_log_cleanup) | `@weekly` | -| [`check_silenced_dags`](#check_silenced_dags) | `@weekly` | -| [`flickr_audit_sub_provider_workflow`](#flickr_audit_sub_provider_workflow) | `@monthly` | -| [`pr_review_reminders`](#pr_review_reminders) | `0 0 * * 1-5` | -| [`rotate_db_snapshots`](#rotate_db_snapshots) | `0 0 * * 6` | - -## Oauth - -| DAG ID | Schedule Interval | -| ----------------------------------------------- | ----------------- | -| [`oauth2_authorization`](#oauth2_authorization) | `None` | -| [`oauth2_token_refresh`](#oauth2_token_refresh) | `0 */12 * * *` | - -## Other - -| DAG ID | Schedule Interval | -| --------------------------------------------------------- | ----------------- | -| [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) | `None` | - -## Popularity Refresh - -| DAG ID | Schedule Interval | -| ------------------------------------------------------- | ----------------- | -| [`audio_popularity_refresh`](#audio_popularity_refresh) | `@monthly` | -| [`image_popularity_refresh`](#image_popularity_refresh) | `@monthly` | - -## Provider - -| DAG ID | Schedule Interval | Dated | Media Type(s) | -| --------------------------------------------------------------- | ----------------- | ------- | ------------- | -| `brooklyn_museum_workflow` | `@monthly` | `False` | image | -| `cleveland_museum_workflow` | `@monthly` | `False` | image | -| [`europeana_workflow`](#europeana_workflow) | `@daily` | `True` | image | -| [`finnish_museums_workflow`](#finnish_museums_workflow) | `@daily` | `True` | image | -| [`flickr_workflow`](#flickr_workflow) | `@daily` | `True` | image | -| [`freesound_workflow`](#freesound_workflow) | `@quarterly` | `False` | audio | -| [`inaturalist_workflow`](#inaturalist_workflow) | `0 0 2 * *` | `False` | image | -| [`jamendo_workflow`](#jamendo_workflow) | `@monthly` | `False` | audio | -| [`justtakeitfree_workflow`](#justtakeitfree_workflow) | `@monthly` | `False` | image | -| [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) | `@daily` | `True` | image | -| `museum_victoria_workflow` | `@monthly` | `False` | image | -| [`nappy_workflow`](#nappy_workflow) | `@monthly` | `False` | image | 
-| `nypl_workflow` | `@monthly` | `False` | image | -| [`phylopic_workflow`](#phylopic_workflow) | `@weekly` | `False` | image | -| [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image | -| [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image | -| [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image | -| [`smk_workflow`](#smk_workflow) | `@monthly` | `False` | image | -| [`stocksnap_workflow`](#stocksnap_workflow) | `@monthly` | `False` | image | -| [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio | -| [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image | - -## Provider Reingestion - -| DAG ID | Schedule Interval | -| --------------------------------------------------------------------------------------- | ----------------- | -| [`flickr_reingestion_workflow`](#flickr_reingestion_workflow) | `@weekly` | -| [`metropolitan_museum_reingestion_workflow`](#metropolitan_museum_reingestion_workflow) | `@weekly` | -| [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) | `@weekly` | -| [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) | `@weekly` | - -# DAG documentation - -The following is documentation associated with each DAG (where available): - -1. [`add_license_url`](#add_license_url) -1. [`airflow_log_cleanup`](#airflow_log_cleanup) -1. [`audio_data_refresh`](#audio_data_refresh) -1. [`audio_popularity_refresh`](#audio_popularity_refresh) -1. [`batched_update`](#batched_update) -1. [`check_silenced_dags`](#check_silenced_dags) -1. [`create_filtered_audio_index`](#create_filtered_audio_index) -1. [`create_filtered_image_index`](#create_filtered_image_index) -1. [`europeana_workflow`](#europeana_workflow) -1. [`finnish_museums_workflow`](#finnish_museums_workflow) -1. [`flickr_audit_sub_provider_workflow`](#flickr_audit_sub_provider_workflow) -1. [`flickr_reingestion_workflow`](#flickr_reingestion_workflow) -1. [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) -1. [`flickr_workflow`](#flickr_workflow) -1. [`freesound_workflow`](#freesound_workflow) -1. [`image_data_refresh`](#image_data_refresh) -1. [`image_popularity_refresh`](#image_popularity_refresh) -1. [`inaturalist_workflow`](#inaturalist_workflow) -1. [`jamendo_workflow`](#jamendo_workflow) -1. [`justtakeitfree_workflow`](#justtakeitfree_workflow) -1. [`metropolitan_museum_reingestion_workflow`](#metropolitan_museum_reingestion_workflow) -1. [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) -1. [`nappy_workflow`](#nappy_workflow) -1. [`oauth2_authorization`](#oauth2_authorization) -1. [`oauth2_token_refresh`](#oauth2_token_refresh) -1. [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) -1. [`phylopic_workflow`](#phylopic_workflow) -1. [`pr_review_reminders`](#pr_review_reminders) -1. [`rawpixel_workflow`](#rawpixel_workflow) -1. [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) -1. [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) -1. [`report_pending_reported_media`](#report_pending_reported_media) -1. [`rotate_db_snapshots`](#rotate_db_snapshots) -1. [`science_museum_workflow`](#science_museum_workflow) -1. [`smithsonian_workflow`](#smithsonian_workflow) -1. [`smk_workflow`](#smk_workflow) -1. [`staging_database_restore`](#staging_database_restore) -1. [`stocksnap_workflow`](#stocksnap_workflow) -1. 
[`wikimedia_commons_workflow`](#wikimedia_commons_workflow) -1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) -1. [`wordpress_workflow`](#wordpress_workflow) - -## `add_license_url` - -### Add license URL - -Add `license_url` to all rows that have `NULL` in their `meta_data` fields. This -PR sets the meta_data value to "{license_url: https://... }", where the url is -constructed from the `license` and `license_version` columns. - -This is a maintenance DAG that should be run once. If all the null values in the -`meta_data` column are updated, the DAG will only run the first and the last -step, logging the statistics. - -## `airflow_log_cleanup` - -### Clean up airflow logs - -A maintenance workflow that you can deploy into Airflow to periodically clean -out the task logs to avoid those getting too big. By default, this will also -clean child process logs from the 'scheduler' directory. - -Can remove all log files by setting "maxLogAgeInDays" to -1. If you want to test -the DAG in the Airflow Web UI, you can also set enableDelete to `false`, and -then you will see a list of log folders that can be deleted, but will not -actually delete them. - -This should all go on one line: - -``` -airflow dags trigger --conf -'{"maxLogAgeInDays":-1, "enableDelete": "false"}' airflow_log_cleanup -``` - -`--conf` options: - -- maxLogAgeInDays: - Optional -- enableDelete: - Optional - -## `audio_data_refresh` - -### Data Refresh DAG Factory - -This file generates our data refresh DAGs using a factory function. For the -given media type these DAGs will initiate a data refresh on the ingestion server -and await the success or failure of that task. - -A data refresh occurs on the Ingestion server in the Openverse project. This is -a task which imports data from the upstream Catalog database into the API, -copies contents to a new Elasticsearch index, and finally makes the index -"live". This process is necessary to make new content added to the Catalog by -our provider DAGs available to the API. You can read more in the -[README](https://github.com/WordPress/openverse/blob/main/ingestion_server/README.md) -Importantly, the data refresh TaskGroup is also configured to handle concurrency -requirements of the Ingestion server. Finally, once the origin indexes have been -refreshed, the corresponding filtered index creation DAG is triggered. - -You can find more background information on this process in the following issues -and related PRs: - -- [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) -- [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) - -## `audio_popularity_refresh` - -### Popularity Refresh DAG Factory - -This file generates our popularity refresh DAGs using a factory function. - -For the given media type these DAGs will first update the popularity metrics -table, adding any new metrics and updating the percentile that is used in -calculating the popularity constants. It then refreshes the popularity constants -view, which recalculates the popularity constant for each provider. - -Once the constants have been updated, the DAG will trigger a `batched_update` -DagRun for each provider of this media_type that is configured to support -popularity data. The batched update recalculates standardized popularity scores -for all records, using the new constant. When the updates are complete, all -records have up-to-date popularity data. 
This DAG can be run concurrently with -data refreshes and regular ingestion. - -You can find more background information on this process in the following -implementation plan: - -- [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) - -## `batched_update` - -Batched Update DAG - -This DAG is used to run a batched SQL update on a media table in the Catalog -database. It is automatically triggered by the `popularity_refresh` DAGs to -refresh popularity data using newly calculated constants, but can also be -triggered manually with custom SQL operations. - -The DAG must be run with a valid dag_run configuration specifying the SQL -commands to be run. The DAG will then split the rows to be updated into batches, -and report to Slack when all batches have been updated. It handles all -deadlocking and timeout concerns, ensuring that the provided SQL is run without -interfering with ingestion. For more information, see the implementation plan: -https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html#special-considerations-avoiding-deadlocks-and-timeouts - -By default the DAG will run as a dry_run, logging the generated SQL but not -actually running it. To actually perform the update, the `dry_run` parameter -must be explicitly set to `false` in the configuration. - -Required Dagrun Configuration parameters: - -- query_id: a string identifier which will be appended to temporary table used - in the update -- table_name: the name of the table to update. Must be a valid media table -- select_query: a SQL `WHERE` clause used to select the rows that will be - updated -- update_query: the SQL `UPDATE` expression to be run on all selected rows - -Optional params: - -- dry_run: bool, whether to actually run the generated SQL. True by default. -- batch_size: int number of records to process in each batch. By default, 10_000 -- update_timeout: int number of seconds to run an individual batch update before - timing out. By default, 3600 (or one hour) -- resume_update: boolean indicating whether to attempt to resume an update using - an existing temp table matching the `query_id`. When True, a new temp table is - not created. - -An example dag_run configuration used to set the thumbnails of all Flickr images -to null would look like this: - -``` -{ - "query_id": "my_flickr_query", - "table_name": "image", - "select_query": "WHERE provider='flickr'", - "update_query": "SET thumbnail=null", - "batch_size": 10, - "dry_run": false -} -``` - -The `update_batches` task automatically keeps track of its progress in an -Airflow variable suffixed with the `query_id`. If the task fails, when it -resumes (either through a retry or by being manually cleared), it will pick up -from where it left off. Manually managing this Airflow variable should not be -necessary. - -It is also possible to start an entirely new DagRun using an existing temp -table, by setting the `resume_update` param to True. With this option enabled, -the DAG will skip creating the temp table and instead attempt to run an update -with an existing temp table matching the `query_id`. This option should only be -used when the DagRun configuration needs to be changed after the table was -already created: for example, if there was a problem with the `update_query` -which caused DAG failures during the `update_batches` step. 
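As a rough illustration (not part of the DAG itself), the example configuration
above could be submitted through the Airflow CLI, leaving `dry_run` at its
default so the generated SQL is only logged. The snippet below assumes the
`airflow` command is available wherever it is run, e.g. inside the scheduler
container:

```
import json
import subprocess

# Hypothetical dry-run trigger: the conf keys mirror the example configuration
# above, and "dry_run" is left at its default of true so the SQL is only logged.
conf = {
    "query_id": "my_flickr_query",
    "table_name": "image",
    "select_query": "WHERE provider='flickr'",
    "update_query": "SET thumbnail=null",
    "batch_size": 10,
}

subprocess.run(
    ["airflow", "dags", "trigger", "batched_update", "--conf", json.dumps(conf)],
    check=True,
)
```

Once the logged SQL looks correct, the same configuration can be re-submitted
with `"dry_run": false` to actually perform the update.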
- -## `check_silenced_dags` - -### Silenced DAGs check - -Check for DAGs that have silenced Slack alerts or skipped errors which may need -to be turned back on. - -When a DAG has known failures, it can be omitted from Slack error reporting by -adding an entry to the `SILENCED_SLACK_NOTIFICATIONS` Airflow variable. -Similarly, errors that occur during the `pull_data` step may be configured to -skip and allow ingestion to continue, using the `SKIPPED_INGESTION_ERRORS` -Airflow variable. These variables are dictionaries where the key is the `dag_id` -of the affected DAG, and the value is a list of configuration dictionaries -mapping an error `predicate` to be skipped/silenced to an open GitHub issue. - -The `check_silenced_dags` DAG iterates over the entries in the -`SILENCED_SLACK_NOTIFICATIONS` and `SKIPPED_INGESTION_ERRORS` configurations and -verifies that the associated GitHub issues are still open. If an issue has been -closed, it is assumed that the entry should be removed, and an alert is sent to -prompt manual update of the configuration. This prevents developers from -forgetting to re-enable Slack reporting or turnoff error skipping after the -issue has been resolved. - -The DAG runs weekly. - -## `create_filtered_audio_index` - -### Create filtered index DAG factory - -This module creates the filtered index creation DAGs for each media type using a -factory function. - -Filtered index creation is handled by the ingestion server. The DAGs generated -by the `build_create_filtered_index_dag` function in this module are responsible -for triggering the ingestion server action to create and populate the filtered -index for a given media type. The DAG awaits the completion of the filtered -index creation and then points the filtered index alias for the media type to -the newly created index. - -#### When this DAG runs - -The DAGs generated in this module are triggered by the data refresh DAGs. -Maintaining this process separate from the data refresh DAGs, while still -triggering it there, allows us to run filtered index creation independently of -the full data refresh. This is primarily useful in two cases: for testing -changes to the filtered index creation; and for re-running filtered index -creation if an urgent change to the sensitive terms calls for an immediate -recreation of the filtered indexes. - -#### Race conditions - -Because filtered index creation employs the `reindex` Elasticsearch API to -derive the filtered index from an existing index, we need to be mindful of the -race condition that potentially exists between the data refresh DAG and this -DAG. The race condition is caused by the fact that the data refresh DAG always -deletes the previous index once the new index for the media type is finished -being created. Consider the situation where filtered index creation is triggered -to run during a data refresh. The filtered index is being derived from the -previous index for the media type. Once the data refresh is finished, it will -delete that index, causing the reindex to halt because suddenly it has no data -source from which to pull documents. - -There are two mechanisms that prevent this from happening: - -1. The filtered index creation DAGs are not allowed to run if a data refresh for - the media type is already running. -2. The data refresh DAGs will wait for any pre-existing filtered index creation - DAG runs for the media type to finish before continuing. 
- -This ensures that neither are depending on or modifying the origin indexes -critical for the creation of the filtered indexes. - -Because the data refresh DAG triggers the filtered index creation DAG, we do -allow a `force` param to be passed to the DAGs generated by this module. This -parameter is only for use by the data refresh DAG and should not be used when -manually triggering the DAG unless you are absolutely certain of what you are -doing. - -## `create_filtered_image_index` - -### Create filtered index DAG factory - -This module creates the filtered index creation DAGs for each media type using a -factory function. - -Filtered index creation is handled by the ingestion server. The DAGs generated -by the `build_create_filtered_index_dag` function in this module are responsible -for triggering the ingestion server action to create and populate the filtered -index for a given media type. The DAG awaits the completion of the filtered -index creation and then points the filtered index alias for the media type to -the newly created index. - -#### When this DAG runs - -The DAGs generated in this module are triggered by the data refresh DAGs. -Maintaining this process separate from the data refresh DAGs, while still -triggering it there, allows us to run filtered index creation independently of -the full data refresh. This is primarily useful in two cases: for testing -changes to the filtered index creation; and for re-running filtered index -creation if an urgent change to the sensitive terms calls for an immediate -recreation of the filtered indexes. - -#### Race conditions - -Because filtered index creation employs the `reindex` Elasticsearch API to -derive the filtered index from an existing index, we need to be mindful of the -race condition that potentially exists between the data refresh DAG and this -DAG. The race condition is caused by the fact that the data refresh DAG always -deletes the previous index once the new index for the media type is finished -being created. Consider the situation where filtered index creation is triggered -to run during a data refresh. The filtered index is being derived from the -previous index for the media type. Once the data refresh is finished, it will -delete that index, causing the reindex to halt because suddenly it has no data -source from which to pull documents. - -There are two mechanisms that prevent this from happening: - -1. The filtered index creation DAGs are not allowed to run if a data refresh for - the media type is already running. -2. The data refresh DAGs will wait for any pre-existing filtered index creation - DAG runs for the media type to finish before continuing. - -This ensures that neither are depending on or modifying the origin indexes -critical for the creation of the filtered indexes. - -Because the data refresh DAG triggers the filtered index creation DAG, we do -allow a `force` param to be passed to the DAGs generated by this module. This -parameter is only for use by the data refresh DAG and should not be used when -manually triggering the DAG unless you are absolutely certain of what you are -doing. - -## `europeana_workflow` - -Content Provider: Europeana - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the images and the respective meta-data. - -Notes: https://pro.europeana.eu/page/search - -## `finnish_museums_workflow` - -Content Provider: Finnish Museums - -ETL Process: Use the API to identify all CC licensed images. 
- -Output: TSV file containing the images and the respective meta-data. - -Notes: https://api.finna.fi/swagger-ui/ -https://www.finna.fi/Content/help-syntax?lng=en-gb The Finnish Museums provider -script is a dated DAG that ingests all records that were last updated in the -previous day. Because of this, it is not necessary to run a separate reingestion -DAG, as updated data will be processed during regular ingestion. - -## `flickr_audit_sub_provider_workflow` - -### Flickr Sub Provider Audit - -Check the list of member institutions of the Flickr Commons for institutions -that have cc-licensed images and are not already configured as sub-providers for -the Flickr DAG. Report suggestions for new sub-providers to Slack. - -## `flickr_reingestion_workflow` - -Content Provider: Flickr - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the images and the respective meta-data. - -Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. - -## `flickr_thumbnails_removal` - -One-time run DAG to remove progressively all the old Flickr thumbnails, as they -were determined to be unsuitable for the Openverse UI requirements. - -## `flickr_workflow` - -Content Provider: Flickr - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the images and the respective meta-data. - -Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. - -## `freesound_workflow` - -Content Provider: Freesound - -ETL Process: Use the API to identify all CC-licensed images. - -Output: TSV file containing the image, the respective meta-data. - -Notes: https://freesound.org/docs/api/ Rate limit: No limit for our API key. -This script can be run either to ingest the full dataset or as a dated DAG. - -## `image_data_refresh` - -### Data Refresh DAG Factory - -This file generates our data refresh DAGs using a factory function. For the -given media type these DAGs will initiate a data refresh on the ingestion server -and await the success or failure of that task. - -A data refresh occurs on the Ingestion server in the Openverse project. This is -a task which imports data from the upstream Catalog database into the API, -copies contents to a new Elasticsearch index, and finally makes the index -"live". This process is necessary to make new content added to the Catalog by -our provider DAGs available to the API. You can read more in the -[README](https://github.com/WordPress/openverse/blob/main/ingestion_server/README.md) -Importantly, the data refresh TaskGroup is also configured to handle concurrency -requirements of the Ingestion server. Finally, once the origin indexes have been -refreshed, the corresponding filtered index creation DAG is triggered. - -You can find more background information on this process in the following issues -and related PRs: - -- [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) -- [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) - -## `image_popularity_refresh` - -### Popularity Refresh DAG Factory - -This file generates our popularity refresh DAGs using a factory function. - -For the given media type these DAGs will first update the popularity metrics -table, adding any new metrics and updating the percentile that is used in -calculating the popularity constants. 
It then refreshes the popularity constants -view, which recalculates the popularity constant for each provider. - -Once the constants have been updated, the DAG will trigger a `batched_update` -DagRun for each provider of this media_type that is configured to support -popularity data. The batched update recalculates standardized popularity scores -for all records, using the new constant. When the updates are complete, all -records have up-to-date popularity data. This DAG can be run concurrently with -data refreshes and regular ingestion. - -You can find more background information on this process in the following -implementation plan: - -- [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) - -## `inaturalist_workflow` - -Provider: iNaturalist - -Output: Records loaded to the image catalog table. - -Notes: The iNaturalist API is not intended for data scraping. -https://api.inaturalist.org/v1/docs/ But there is a full dump intended for -sharing on S3. -https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata -Because these are very large normalized tables, as opposed to more document -oriented API responses, we found that bringing the data into postgres first was -the most effective approach. More detail in slack here: -https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N -We use the table structure defined here, -https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql -except for adding ancestry tags to the taxa table. - -## `jamendo_workflow` - -Content Provider: Jamendo - -ETL Process: Use the API to identify all CC-licensed audio. - -Output: TSV file containing the audio meta-data. - -Notes: https://api.jamendo.com/v3.0/tracks/ 35,000 requests per month for -non-commercial apps Jamendo Music has more than 500,000 tracks shared by 40,000 -artists from over 150 countries all over the world. Audio quality: uploaded as -WAV/ FLAC/ AIFF bit depth: 16/24 sample rate: 44.1 or 48 kHz channels: 1/2 - -## `justtakeitfree_workflow` - -Content Provider: Justtakeitfree - -ETL Process: Use the API to identify all CC licensed media. - -Output: TSV file containing the media and the respective meta-data. - -Notes: https://justtakeitfree.com/api/api.php This API requires an API key. For -more details, see https://github.com/WordPress/openverse/pull/2793 - -## `metropolitan_museum_reingestion_workflow` - -Content Provider: Metropolitan Museum of Art - -ETL Process: Use the API to identify all CC0 artworks. - -Output: TSV file containing the image, their respective meta-data. - -Notes: https://metmuseum.github.io/#search "Please limit requests to 80 requests -per second." May need to bump up the delay (e.g. to 3 seconds), to avoid of -blocking during local development testing. - - Some analysis to improve data quality was conducted using a - separate csv file here: https://github.com/metmuseum/openaccess - - Get a list of object IDs: - https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 - Get a specific object: - https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 - The search functionality requires a specific query (term search) - in addition to date and public domain. It seems like it won't - connect with just date and license. 
- https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 - -## `metropolitan_museum_workflow` - -Content Provider: Metropolitan Museum of Art - -ETL Process: Use the API to identify all CC0 artworks. - -Output: TSV file containing the image, their respective meta-data. - -Notes: https://metmuseum.github.io/#search "Please limit requests to 80 requests -per second." May need to bump up the delay (e.g. to 3 seconds), to avoid of -blocking during local development testing. - - Some analysis to improve data quality was conducted using a - separate csv file here: https://github.com/metmuseum/openaccess - - Get a list of object IDs: - https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 - Get a specific object: - https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 - The search functionality requires a specific query (term search) - in addition to date and public domain. It seems like it won't - connect with just date and license. - https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 - -## `nappy_workflow` - -Content Provider: Nappy - -ETL Process: Use the API to identify all CC0-licensed images. - -Output: TSV file containing the image meta-data. - -Notes: This api was written specially for Openverse. There are no known limits -or restrictions. https://nappy.co/ - -## `oauth2_authorization` - -### OAuth Provider Authorization - -Iterates through all the OAuth2 providers and attempts to authorize them using -tokens found in the in the `OAUTH2_AUTH_KEYS` Variable. Once authorization has -been completed successfully, the auth token is removed from that Variable. The -authorization will create an access/refresh token pair in the -`OAUTH2_ACCESS_TOKENS` Variable. - -**Current Providers**: - -- Freesound - -## `oauth2_token_refresh` - -### OAuth Provider Token Refresh - -Iterates through all OAuth2 providers and attempts to refresh the access token -using the refresh token stored in the `OAUTH2_ACCESS_TOKENS` Variable. This DAG -will update the tokens stored in the Variable upon successful refresh. - -**Current Providers**: - -- Freesound - -## `phylopic_reingestion_workflow` - -Content Provider: PhyloPic - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the image, their respective meta-data. - -Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. - -## `phylopic_workflow` - -Content Provider: PhyloPic - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the image, their respective meta-data. - -Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. - -## `pr_review_reminders` - -### PR Review Reminders - -Iterates through open PRs in our repositories and pings assigned reviewers who -have not yet approved the PR or explicitly requested changes. - -This DAG runs daily and pings on the following schedule based on priority label: - -| priority | days | -| -------- | ------- | -| critical | 1 day | -| high | >2 days | -| medium | >4 days | -| low | >7 days | - -The DAG does not ping on Saturday and Sunday and accounts for weekend days when -determining how much time has passed since the review. - -Unfortunately the DAG does not know when someone is on vacation. It is up to the -author of the PR to re-assign review if one of the randomly selected reviewers -is unavailable for the time period during which the PR should be reviewed. 
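For illustration only (this is not the DAG's actual implementation), the
weekend-aware accounting described above amounts to counting only weekdays
between the review request and today and comparing that count against a
per-priority threshold; the thresholds below are assumptions mirroring the
table:

```
from datetime import date, timedelta

# Hypothetical thresholds mirroring the priority table above.
DAYS_ALLOWED = {"critical": 1, "high": 2, "medium": 4, "low": 7}


def weekdays_between(start: date, end: date) -> int:
    """Count elapsed days between two dates, skipping Saturdays and Sundays."""
    return sum(
        1
        for offset in range((end - start).days)
        if (start + timedelta(days=offset)).weekday() < 5  # Mon=0 ... Fri=4
    )


def needs_reminder(priority: str, review_requested: date, today: date) -> bool:
    return weekdays_between(review_requested, today) > DAYS_ALLOWED[priority]
```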
- -## `rawpixel_workflow` - -Content Provider: Rawpixel - -ETL Process: Use the API to identify all CC-licensed images. - -Output: TSV file containing the image meta-data. - -Notes: Rawpixel has given Openverse beta access to their API. This API is -undocumented, and we will need to contact Rawpixel directly if we run into any -issues. The public API max results range is limited to 100,000 results, although -the API key we've been given can circumvent this limit. -https://www.rawpixel.com/api/v1/search?tags=$publicdomain&page=1&pagesize=100 - -## `recreate_audio_popularity_calculation` - -This file generates Apache Airflow DAGs that, for the given media type, -completely wipes out and recreates the PostgreSQL functions involved in -calculating our standardized popularity metric. - -Note that they do not drop any tables or views related to popularity, and they -do not perform any popularity calculations. Once this DAG has been run, the -associated popularity refresh DAG must be run in order to actually recalculate -popularity constants and standardized popularity scores using the new functions. - -These DAGs are not on a schedule, and should only be run manually when new SQL -code is deployed for the calculation. - -## `recreate_image_popularity_calculation` - -This file generates Apache Airflow DAGs that, for the given media type, -completely wipes out and recreates the PostgreSQL functions involved in -calculating our standardized popularity metric. - -Note that they do not drop any tables or views related to popularity, and they -do not perform any popularity calculations. Once this DAG has been run, the -associated popularity refresh DAG must be run in order to actually recalculate -popularity constants and standardized popularity scores using the new functions. - -These DAGs are not on a schedule, and should only be run manually when new SQL -code is deployed for the calculation. - -## `report_pending_reported_media` - -### Report Pending Reported Media DAG - -This DAG checks for any user-reported media pending manual review, and alerts -via Slack. - -Media may be reported for mature content or copyright infringement, for example. -Once reported, these require manual review through the Django Admin to determine -whether further action (such as deindexing the record) needs to be taken. If a -record has been reported multiple times, it only needs to be reviewed once and -so is only counted once in the reporting by this DAG. - -## `rotate_db_snapshots` - -Manages weekly database snapshots. - -RDS does not support weekly snapshots schedules on its own, so we need a DAG to -manage this for us. - -It runs on Saturdays at 00:00 UTC in order to happen before the data refresh. - -The DAG will automatically delete the oldest snapshots when more snaphots exist -than it is configured to retain. - -Requires two variables: - -`CATALOG_RDS_DB_IDENTIFIER`: The "DBIdentifier" of the RDS DB instance. -`CATALOG_RDS_SNAPSHOTS_TO_RETAIN`: How many historical snapshots to retain. - -## `science_museum_workflow` - -Content Provider: Science Museum - -ETL Process: Use the API to identify all CC-licensed images. - -Output: TSV file containing the image, the respective meta-data. - -Notes: -https://github.com/TheScienceMuseum/collectionsonline/wiki/Collections-Online-API -Rate limited, no specific rate given. - -## `smithsonian_workflow` - -Content Provider: Smithsonian - -ETL Process: Use the API to identify all CC licensed images. - -Output: TSV file containing the images and the respective meta-data. 
- -Notes: https://api.si.edu/openaccess/api/v1.0/search - -## `smk_workflow` - -Content Provider: Statens Museum for Kunst (National Gallery of Denmark) - -ETL Process: Use the API to identify all openly licensed media. - -Output: TSV file containing the media metadata. - -Notes: https://www.smk.dk/en/article/smk-api/ - -## `staging_database_restore` - -### Update the staging database - -This DAG is responsible for updating the staging database using the most recent -snapshot of the production database. - -For a full explanation of the DAG, see the implementation plan description: -https://docs.openverse.org/projects/proposals/search_relevancy_sandbox/20230406-implementation_plan_update_staging_database.html#dag - -This DAG can be skipped by setting the `SKIP_STAGING_DATABASE_RESTORE` Airflow -Variable to `true`. To change this variable, navigate to Admin > Variables in -the Airflow UI, then click the "edit" button next to the variable and set the -value to either `true` or `false`. - -This DAG will default to using the standard AWS connection ID for the RDS -operations. For local testing, you can set up two environment variables to have -the RDS operations run using a different hook: - -- `AWS_RDS_CONN_ID`: The Airflow connection ID to use for RDS operations (e.g. - `aws_rds`) -- `AIRFLOW_CONN_`: The connection string to use for RDS operations (per the - above example, it might be `AIRFLOW_CONN_AWS_RDS`) - -## `stocksnap_workflow` - -Content Provider: StockSnap - -ETL Process: Use the API to identify all CC-licensed images. - -Output: TSV file containing the image, the respective meta-data. - -Notes: https://stocksnap.io/api/load-photos/date/desc/1 https://stocksnap.io/faq -All images are licensed under CC0. No rate limits or authorization required. API -is undocumented. - -## `wikimedia_commons_workflow` - -**Content Provider:** Wikimedia Commons - -**ETL Process:** Use the API to identify all CC-licensed images. - -**Output:** TSV file containing the image, the respective meta-data. - -### Notes - -Rate limit of no more than 200 requests/second, and we are required to set a -unique User-Agent field -([docs](https://www.mediawiki.org/wiki/Wikimedia_REST_API#Terms_and_conditions)). - -Wikimedia Commons uses an implementation of the -[MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page). This API is -incredibly complex in the level of configuration you can provide when querying, -and as such it can also be quite abstruse. The most straightforward docs can be -found on the -[Wikimedia website directly](https://commons.wikimedia.org/w/api.php?action=help&modules=query), -as these show all the parameters available. Specifications on queries can also -be found on the [query page](https://www.mediawiki.org/wiki/API:Query). - -Different kinds of queries can be made against the API using "modules", we use -the [allimages module](https://www.mediawiki.org/wiki/API:Allimages), which lets -us search for images in a given time range (see `"generator": "allimages"` in -the query params). - -Many queries will return results in batches, with the API supplying a "continue" -token. This token is used on subsequent calls to tell the API where to start the -next set of results from; it functions as a page offset -([docs](https://www.mediawiki.org/wiki/API:Query#Continuing_queries)). - -We can also specify what kinds of information we want for the query. Wikimedia -has a massive amount of data on it, so it only returns what we ask for. 
The -fields that are returned are defined by the -[properties](https://www.mediawiki.org/wiki/API:Properties) or "props" -parameter. Sub-properties can also be defined, with the parameter name of the -sub-property determined by an abbreviation of the higher-level property. For -instance, if our property is "imageinfo" and we want to set sub-property values, -we would define those in "iiprops". - -The data within a property is paginated as well, Wikimedia will handle iteration -through the property sub-pages using the "continue" token -([see here for more details](https://www.mediawiki.org/wiki/API:Properties#Additional_notes)). - -Depending on the kind of property data that's being returned, it's possible for -the API to iterate extensively on a specific media item. What Wikimedia is -iterating over in these cases can be gleaned from the "continue" token. Those -tokens take the form of, as I understand it, -"||", paired with an "continue" value -for the property being iterated over. For example, if we're were iterating over -a set of image properties, the token might look like: - -``` -{ - "iicontinue": "The_Railway_Chronicle_1844.pdf|20221209222801", - "gaicontinue": "20221209222614|NTUL-0527100_英國產業革命史略.pdf", - "continue": "gaicontinue||globalusage", -} -``` - -In this case, we're iterating over the "global all images" generator -(gaicontinue) as our primary iterator, with the "image properties" (iicontinue) -as the secondary continue iterator. The "globalusage" property would be the next -property to iterate over. It's also possible for multiple sub-properties to be -iterated over simultaneously, in which case the "continue" token would not have -a secondary value (e.g. `gaicontinue||`). - -In most runs, the "continue" key will be `gaicontinue||` after the first -request, which means that we have more than one batch to iterate over for the -primary iterator. Some days will have fewer images than the batch limit but -still have multiple batches of content on the secondary iterator, which means -the "continue" key may not have a primary iteration component (e.g. -`||globalusage`). This token can also be seen when the first request has more -data in the secondary iterator, before we've processed any data on the primary -iterator. - -Occasionally, the ingester will come across a piece of media that has many -results for the property it's iterating over. An example of this can include an -item being on many pages, this it would have many "global usage" results. In -order to process the entire batch, we have to iterate over _all_ of the returned -results; Wikimedia does not provide a mechanism to "skip to the end" of a batch. -On numerous occasions, this iteration has been so extensive that the pull media -task has hit the task's timeout. To avoid this, we limit the number of -iterations we make for parsing through a sub-property's data. If we hit the -limit, we re-issue the original query _without_ requesting properties that -returned large amounts of data. Unfortunately, this means that we will **not** -have that property's data for these items the second time around (e.g. -popularity data if we needed to skip global usage). In the case of popularity, -especially since the problem with these images is that they're so popular, we -want to preserve that information where possible! So we cache the popularity -data from previous iterations and use it in subsequent ones if we come across -the same item again. 
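As a rough sketch of the continuation pattern described above (not the
ingester's actual code), each batch's "continue" object is simply echoed back
on the next request. Here `base_params` is assumed to already carry the
`action=query`, generator, and property settings, and the User-Agent value is a
placeholder for the required unique identifier:

```
import requests

API_URL = "https://commons.wikimedia.org/w/api.php"
HEADERS = {"User-Agent": "openverse-docs-example/0.1 (example@example.org)"}


def iterate_batches(base_params: dict):
    continue_params: dict = {}
    while True:
        data = requests.get(
            API_URL,
            params={**base_params, **continue_params, "format": "json"},
            headers=HEADERS,
        ).json()
        yield data.get("query", {})
        if "continue" not in data:
            break  # no more batches for this query
        # e.g. {"gaicontinue": "...", "continue": "gaicontinue||"}; echoing it
        # back verbatim resumes where the previous batch stopped.
        continue_params = data["continue"]
```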
- -Below are some specific references to various properties, with examples for -cases where they might exceed the limit. Technically, it's feasible for almost -any property to exceed the limit, but these are the ones that we've seen in -practice. - -#### `imageinfo` - -[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) - -[Example where metadata has hundreds of data points](https://commons.wikimedia.org/wiki/File:The_Railway_Chronicle_1844.pdf#metadata) -(see "Metadata" table, which may need to be expanded). - -For these requests, we can remove the `metadata` property from the `iiprops` -parameter to avoid this issue on subsequent iterations. - -#### `globalusage` - -[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) - -[Example where an image is used on almost every wiki](https://commons.wikimedia.org/w/index.php?curid=4298234). - -For these requests, we can remove the `globalusage` property from the `prop` -parameter entirely and eschew the popularity data for these items. - -## `wikimedia_reingestion_workflow` - -**Content Provider:** Wikimedia Commons - -**ETL Process:** Use the API to identify all CC-licensed images. - -**Output:** TSV file containing the image, the respective meta-data. - -### Notes - -Rate limit of no more than 200 requests/second, and we are required to set a -unique User-Agent field -([docs](https://www.mediawiki.org/wiki/Wikimedia_REST_API#Terms_and_conditions)). - -Wikimedia Commons uses an implementation of the -[MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page). This API is -incredibly complex in the level of configuration you can provide when querying, -and as such it can also be quite abstruse. The most straightforward docs can be -found on the -[Wikimedia website directly](https://commons.wikimedia.org/w/api.php?action=help&modules=query), -as these show all the parameters available. Specifications on queries can also -be found on the [query page](https://www.mediawiki.org/wiki/API:Query). - -Different kinds of queries can be made against the API using "modules", we use -the [allimages module](https://www.mediawiki.org/wiki/API:Allimages), which lets -us search for images in a given time range (see `"generator": "allimages"` in -the query params). - -Many queries will return results in batches, with the API supplying a "continue" -token. This token is used on subsequent calls to tell the API where to start the -next set of results from; it functions as a page offset -([docs](https://www.mediawiki.org/wiki/API:Query#Continuing_queries)). - -We can also specify what kinds of information we want for the query. Wikimedia -has a massive amount of data on it, so it only returns what we ask for. The -fields that are returned are defined by the -[properties](https://www.mediawiki.org/wiki/API:Properties) or "props" -parameter. Sub-properties can also be defined, with the parameter name of the -sub-property determined by an abbreviation of the higher-level property. For -instance, if our property is "imageinfo" and we want to set sub-property values, -we would define those in "iiprops". - -The data within a property is paginated as well, Wikimedia will handle iteration -through the property sub-pages using the "continue" token -([see here for more details](https://www.mediawiki.org/wiki/API:Properties#Additional_notes)). - -Depending on the kind of property data that's being returned, it's possible for -the API to iterate extensively on a specific media item. 
What Wikimedia is -iterating over in these cases can be gleaned from the "continue" token. Those -tokens take the form of, as I understand it, -"||", paired with an "continue" value -for the property being iterated over. For example, if we're were iterating over -a set of image properties, the token might look like: - -``` -{ - "iicontinue": "The_Railway_Chronicle_1844.pdf|20221209222801", - "gaicontinue": "20221209222614|NTUL-0527100_英國產業革命史略.pdf", - "continue": "gaicontinue||globalusage", -} -``` - -In this case, we're iterating over the "global all images" generator -(gaicontinue) as our primary iterator, with the "image properties" (iicontinue) -as the secondary continue iterator. The "globalusage" property would be the next -property to iterate over. It's also possible for multiple sub-properties to be -iterated over simultaneously, in which case the "continue" token would not have -a secondary value (e.g. `gaicontinue||`). - -In most runs, the "continue" key will be `gaicontinue||` after the first -request, which means that we have more than one batch to iterate over for the -primary iterator. Some days will have fewer images than the batch limit but -still have multiple batches of content on the secondary iterator, which means -the "continue" key may not have a primary iteration component (e.g. -`||globalusage`). This token can also be seen when the first request has more -data in the secondary iterator, before we've processed any data on the primary -iterator. - -Occasionally, the ingester will come across a piece of media that has many -results for the property it's iterating over. An example of this can include an -item being on many pages, this it would have many "global usage" results. In -order to process the entire batch, we have to iterate over _all_ of the returned -results; Wikimedia does not provide a mechanism to "skip to the end" of a batch. -On numerous occasions, this iteration has been so extensive that the pull media -task has hit the task's timeout. To avoid this, we limit the number of -iterations we make for parsing through a sub-property's data. If we hit the -limit, we re-issue the original query _without_ requesting properties that -returned large amounts of data. Unfortunately, this means that we will **not** -have that property's data for these items the second time around (e.g. -popularity data if we needed to skip global usage). In the case of popularity, -especially since the problem with these images is that they're so popular, we -want to preserve that information where possible! So we cache the popularity -data from previous iterations and use it in subsequent ones if we come across -the same item again. - -Below are some specific references to various properties, with examples for -cases where they might exceed the limit. Technically, it's feasible for almost -any property to exceed the limit, but these are the ones that we've seen in -practice. - -#### `imageinfo` - -[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) - -[Example where metadata has hundreds of data points](https://commons.wikimedia.org/wiki/File:The_Railway_Chronicle_1844.pdf#metadata) -(see "Metadata" table, which may need to be expanded). - -For these requests, we can remove the `metadata` property from the `iiprops` -parameter to avoid this issue on subsequent iterations. 
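A minimal sketch of that retry adjustment, assuming the sub-properties are
passed as a pipe-separated `iiprop`-style parameter and that the rest of the
query parameters are reused unchanged (both are assumptions here, not the
ingester's actual helper):

```
def drop_metadata_prop(params: dict) -> dict:
    # Illustrative only: re-issue the query without the sub-property that
    # produced an excessive number of continuation pages.
    retry_params = dict(params)
    props = retry_params.get("iiprop", "").split("|")
    retry_params["iiprop"] = "|".join(p for p in props if p != "metadata")
    return retry_params
```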
- -#### `globalusage` - -[Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) - -[Example where an image is used on almost every wiki](https://commons.wikimedia.org/w/index.php?curid=4298234). - -For these requests, we can remove the `globalusage` property from the `prop` -parameter entirely and eschew the popularity data for these items. - -## `wordpress_workflow` - -Content Provider: WordPress Photo Directory - -ETL Process: Use the API to identify all openly licensed media. - -Output: TSV file containing the media metadata. - -Notes: https://wordpress.org/photos/wp-json/wp/v2 Provide photos, media, users -and more related resources. No rate limit specified. diff --git a/catalog/DAGs.md b/documentation/catalog/reference/DAGs.md similarity index 100% rename from catalog/DAGs.md rename to documentation/catalog/reference/DAGs.md diff --git a/documentation/catalog/reference/index.md b/documentation/catalog/reference/index.md new file mode 100644 index 00000000000..b98ba63ec03 --- /dev/null +++ b/documentation/catalog/reference/index.md @@ -0,0 +1,7 @@ +# Reference + +```{toctree} +:titlesonly: + +DAGs +``` From 734dc22e5a14602331edf9c8ee03bc5c6477e57b Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Mon, 25 Sep 2023 17:51:25 -0700 Subject: [PATCH 5/8] Reference multi-word titles appropriately --- catalog/utilities/dag_doc_gen/dag_doc_generation.py | 2 +- documentation/catalog/reference/DAGs.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/catalog/utilities/dag_doc_gen/dag_doc_generation.py b/catalog/utilities/dag_doc_gen/dag_doc_generation.py index b3c71734f4e..8b114681d4a 100644 --- a/catalog/utilities/dag_doc_gen/dag_doc_generation.py +++ b/catalog/utilities/dag_doc_gen/dag_doc_generation.py @@ -217,7 +217,7 @@ def generate_dag_doc(dag_folder: Path = DAG_FOLDER) -> str: # For each type we generate a sub-list of DAGs. We add a link to each generated # sub-list as part of a table of contents, but defer adding the sub-lists until # all are generated. - text += f" 1. [{name}](#{type_})\n" + text += f" 1. [{name}](#{type_.replace('_', '-')})\n" dag_types.append(generate_type_subsection(name, dags, is_provider)) text += "\n" + "\n\n".join(dag_types) diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md index ac3212fae71..aeebd259513 100644 --- a/documentation/catalog/reference/DAGs.md +++ b/documentation/catalog/reference/DAGs.md @@ -14,13 +14,13 @@ The DAGs are shown in two forms: The following are DAGs grouped by their primary tag: -1. [Data Normalization](#data_normalization) -1. [Data Refresh](#data_refresh) +1. [Data Normalization](#data-normalization) +1. [Data Refresh](#data-refresh) 1. [Database](#database) 1. [Maintenance](#maintenance) 1. [Oauth](#oauth) 1. [Other](#other) -1. [Popularity Refresh](#popularity_refresh) +1. [Popularity Refresh](#popularity-refresh) 1. [Provider](#provider) 1. 
[Provider Reingestion](#provider-reingestion) From 7e5ff17b7d1eeba79c479e2e8e137b146e7a3152 Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Mon, 25 Sep 2023 17:57:26 -0700 Subject: [PATCH 6/8] Increase indentation by 1 level so only "DAGs" shows in TOC --- .../dag_doc_gen/dag_doc_generation.py | 12 +- documentation/catalog/reference/DAGs.md | 154 +++++++++--------- 2 files changed, 83 insertions(+), 83 deletions(-) diff --git a/catalog/utilities/dag_doc_gen/dag_doc_generation.py b/catalog/utilities/dag_doc_gen/dag_doc_generation.py index 8b114681d4a..436bb709607 100644 --- a/catalog/utilities/dag_doc_gen/dag_doc_generation.py +++ b/catalog/utilities/dag_doc_gen/dag_doc_generation.py @@ -45,13 +45,13 @@ - [DAGs by Type](#dags-by-type) - [Individual DAG documentation](#dag-documentation) -# DAGs by Type +## DAGs by Type The following are DAGs grouped by their primary tag: """ MIDAMBLE = """ -# DAG documentation +## DAG documentation The following is documentation associated with each DAG (where available): @@ -94,7 +94,7 @@ def get_provider_workflows() -> dict[str, ProviderWorkflow]: def fix_headings(doc: str) -> str: """ - Increase all heading levels by 2. + Increase all heading levels by 3. This is necessary to accommodate the embedded setting of the DAG docs in the final Markdown output. @@ -105,7 +105,7 @@ def fix_headings(doc: str) -> str: for match in reversed(list(HEADING_PROG.finditer(doc))): start, end = match.span() original_heading = match.string[start:end] - new_heading = f"##{original_heading}" + new_heading = f"###{original_heading}" doc = f"{doc[:start]}{new_heading}{doc[end:]}" return doc @@ -149,7 +149,7 @@ def generate_type_subsection( ) -> str: """Generate the documentation for a "DAGs by type" subsection.""" log.info(f"Building subsection for '{name}'") - text = f"## {name}\n\n" + text = f"### {name}\n\n" # Columns for all DAGs header = "| DAG ID | Schedule Interval |" # Conditionally add the other columns for the provider-specific DAGs @@ -183,7 +183,7 @@ def generate_type_subsection( def generate_single_documentation(dag: DagInfo) -> str: """Generate the documentation for a single DAG.""" return f""" -## `{dag.dag_id}` +### `{dag.dag_id}` {dag.doc} diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md index aeebd259513..72a3b8f14b6 100644 --- a/documentation/catalog/reference/DAGs.md +++ b/documentation/catalog/reference/DAGs.md @@ -10,7 +10,7 @@ The DAGs are shown in two forms: - [DAGs by Type](#dags-by-type) - [Individual DAG documentation](#dag-documentation) -# DAGs by Type +## DAGs by Type The following are DAGs grouped by their primary tag: @@ -24,13 +24,13 @@ The following are DAGs grouped by their primary tag: 1. [Provider](#provider) 1. 
[Provider Reingestion](#provider-reingestion) -## Data Normalization +### Data Normalization | DAG ID | Schedule Interval | | ------------------------------------- | ----------------- | | [`add_license_url`](#add_license_url) | `None` | -## Data Refresh +### Data Refresh | DAG ID | Schedule Interval | | ------------------------------------------------------------- | ----------------- | @@ -39,7 +39,7 @@ The following are DAGs grouped by their primary tag: | [`create_filtered_image_index`](#create_filtered_image_index) | `None` | | [`image_data_refresh`](#image_data_refresh) | `0 0 * * 1` | -## Database +### Database | DAG ID | Schedule Interval | | --------------------------------------------------------------------------------- | ----------------- | @@ -49,7 +49,7 @@ The following are DAGs grouped by their primary tag: | [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` | | [`staging_database_restore`](#staging_database_restore) | `@monthly` | -## Maintenance +### Maintenance | DAG ID | Schedule Interval | | --------------------------------------------------------------------------- | ----------------- | @@ -59,27 +59,27 @@ The following are DAGs grouped by their primary tag: | [`pr_review_reminders`](#pr_review_reminders) | `0 0 * * 1-5` | | [`rotate_db_snapshots`](#rotate_db_snapshots) | `0 0 * * 6` | -## Oauth +### Oauth | DAG ID | Schedule Interval | | ----------------------------------------------- | ----------------- | | [`oauth2_authorization`](#oauth2_authorization) | `None` | | [`oauth2_token_refresh`](#oauth2_token_refresh) | `0 */12 * * *` | -## Other +### Other | DAG ID | Schedule Interval | | --------------------------------------------------------- | ----------------- | | [`flickr_thumbnails_removal`](#flickr_thumbnails_removal) | `None` | -## Popularity Refresh +### Popularity Refresh | DAG ID | Schedule Interval | | ------------------------------------------------------- | ----------------- | | [`audio_popularity_refresh`](#audio_popularity_refresh) | `@monthly` | | [`image_popularity_refresh`](#image_popularity_refresh) | `@monthly` | -## Provider +### Provider | DAG ID | Schedule Interval | Dated | Media Type(s) | | --------------------------------------------------------------- | ----------------- | ------- | ------------- | @@ -105,7 +105,7 @@ The following are DAGs grouped by their primary tag: | [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio | | [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image | -## Provider Reingestion +### Provider Reingestion | DAG ID | Schedule Interval | | --------------------------------------------------------------------------------------- | ----------------- | @@ -114,7 +114,7 @@ The following are DAGs grouped by their primary tag: | [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) | `@weekly` | | [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) | `@weekly` | -# DAG documentation +## DAG documentation The following is documentation associated with each DAG (where available): @@ -160,9 +160,9 @@ The following is documentation associated with each DAG (where available): 1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) 1. [`wordpress_workflow`](#wordpress_workflow) -## `add_license_url` +### `add_license_url` -### Add license URL +#### Add license URL Add `license_url` to all rows that have `NULL` in their `meta_data` fields. This PR sets the meta_data value to "{license_url: https://... 
}", where the url is @@ -172,9 +172,9 @@ This is a maintenance DAG that should be run once. If all the null values in the `meta_data` column are updated, the DAG will only run the first and the last step, logging the statistics. -## `airflow_log_cleanup` +### `airflow_log_cleanup` -### Clean up airflow logs +#### Clean up airflow logs A maintenance workflow that you can deploy into Airflow to periodically clean out the task logs to avoid those getting too big. By default, this will also @@ -197,9 +197,9 @@ airflow dags trigger --conf - maxLogAgeInDays: - Optional - enableDelete: - Optional -## `audio_data_refresh` +### `audio_data_refresh` -### Data Refresh DAG Factory +#### Data Refresh DAG Factory This file generates our data refresh DAGs using a factory function. For the given media type these DAGs will initiate a data refresh on the ingestion server @@ -221,9 +221,9 @@ and related PRs: - [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) - [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) -## `audio_popularity_refresh` +### `audio_popularity_refresh` -### Popularity Refresh DAG Factory +#### Popularity Refresh DAG Factory This file generates our popularity refresh DAGs using a factory function. @@ -244,7 +244,7 @@ implementation plan: - [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) -## `batched_update` +### `batched_update` Batched Update DAG @@ -311,9 +311,9 @@ used when the DagRun configuration needs to be changed after the table was already created: for example, if there was a problem with the `update_query` which caused DAG failures during the `update_batches` step. -## `check_silenced_dags` +### `check_silenced_dags` -### Silenced DAGs check +#### Silenced DAGs check Check for DAGs that have silenced Slack alerts or skipped errors which may need to be turned back on. @@ -336,9 +336,9 @@ issue has been resolved. The DAG runs weekly. -## `create_filtered_audio_index` +### `create_filtered_audio_index` -### Create filtered index DAG factory +#### Create filtered index DAG factory This module creates the filtered index creation DAGs for each media type using a factory function. @@ -350,7 +350,7 @@ index for a given media type. The DAG awaits the completion of the filtered index creation and then points the filtered index alias for the media type to the newly created index. -#### When this DAG runs +##### When this DAG runs The DAGs generated in this module are triggered by the data refresh DAGs. Maintaining this process separate from the data refresh DAGs, while still @@ -360,7 +360,7 @@ changes to the filtered index creation; and for re-running filtered index creation if an urgent change to the sensitive terms calls for an immediate recreation of the filtered indexes. -#### Race conditions +##### Race conditions Because filtered index creation employs the `reindex` Elasticsearch API to derive the filtered index from an existing index, we need to be mindful of the @@ -389,9 +389,9 @@ parameter is only for use by the data refresh DAG and should not be used when manually triggering the DAG unless you are absolutely certain of what you are doing. 
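For context, the reindex derivation described in the race-conditions section
above boils down to an Elasticsearch `_reindex` call from the origin index into
the filtered index. A minimal sketch with placeholder host and index names (the
real call is issued by the ingestion server, not by this DAG directly):

```
import requests

# Illustrative only: copy documents from an origin index into its filtered
# counterpart and poll the returned task instead of blocking on completion.
response = requests.post(
    "http://localhost:9200/_reindex",
    params={"wait_for_completion": "false"},
    json={
        "source": {"index": "image-origin-index"},
        "dest": {"index": "image-origin-index-filtered"},
    },
    timeout=30,
)
task_id = response.json()["task"]
print(f"Reindex running as task {task_id}")
```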
-## `create_filtered_image_index` +### `create_filtered_image_index` -### Create filtered index DAG factory +#### Create filtered index DAG factory This module creates the filtered index creation DAGs for each media type using a factory function. @@ -403,7 +403,7 @@ index for a given media type. The DAG awaits the completion of the filtered index creation and then points the filtered index alias for the media type to the newly created index. -#### When this DAG runs +##### When this DAG runs The DAGs generated in this module are triggered by the data refresh DAGs. Maintaining this process separate from the data refresh DAGs, while still @@ -413,7 +413,7 @@ changes to the filtered index creation; and for re-running filtered index creation if an urgent change to the sensitive terms calls for an immediate recreation of the filtered indexes. -#### Race conditions +##### Race conditions Because filtered index creation employs the `reindex` Elasticsearch API to derive the filtered index from an existing index, we need to be mindful of the @@ -442,7 +442,7 @@ parameter is only for use by the data refresh DAG and should not be used when manually triggering the DAG unless you are absolutely certain of what you are doing. -## `europeana_workflow` +### `europeana_workflow` Content Provider: Europeana @@ -452,7 +452,7 @@ Output: TSV file containing the images and the respective meta-data. Notes: https://pro.europeana.eu/page/search -## `finnish_museums_workflow` +### `finnish_museums_workflow` Content Provider: Finnish Museums @@ -466,15 +466,15 @@ script is a dated DAG that ingests all records that were last updated in the previous day. Because of this, it is not necessary to run a separate reingestion DAG, as updated data will be processed during regular ingestion. -## `flickr_audit_sub_provider_workflow` +### `flickr_audit_sub_provider_workflow` -### Flickr Sub Provider Audit +#### Flickr Sub Provider Audit Check the list of member institutions of the Flickr Commons for institutions that have cc-licensed images and are not already configured as sub-providers for the Flickr DAG. Report suggestions for new sub-providers to Slack. -## `flickr_reingestion_workflow` +### `flickr_reingestion_workflow` Content Provider: Flickr @@ -484,12 +484,12 @@ Output: TSV file containing the images and the respective meta-data. Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. -## `flickr_thumbnails_removal` +### `flickr_thumbnails_removal` One-time run DAG to remove progressively all the old Flickr thumbnails, as they were determined to be unsuitable for the Openverse UI requirements. -## `flickr_workflow` +### `flickr_workflow` Content Provider: Flickr @@ -499,7 +499,7 @@ Output: TSV file containing the images and the respective meta-data. Notes: https://www.flickr.com/help/terms/api Rate limit: 3600 requests per hour. -## `freesound_workflow` +### `freesound_workflow` Content Provider: Freesound @@ -510,9 +510,9 @@ Output: TSV file containing the image, the respective meta-data. Notes: https://freesound.org/docs/api/ Rate limit: No limit for our API key. This script can be run either to ingest the full dataset or as a dated DAG. -## `image_data_refresh` +### `image_data_refresh` -### Data Refresh DAG Factory +#### Data Refresh DAG Factory This file generates our data refresh DAGs using a factory function. 
For the given media type these DAGs will initiate a data refresh on the ingestion server @@ -534,9 +534,9 @@ and related PRs: - [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353) - [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453) -## `image_popularity_refresh` +### `image_popularity_refresh` -### Popularity Refresh DAG Factory +#### Popularity Refresh DAG Factory This file generates our popularity refresh DAGs using a factory function. @@ -557,7 +557,7 @@ implementation plan: - [[Implementation Plan] Decoupling Popularity Calculations from the Data Refresh](https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html) -## `inaturalist_workflow` +### `inaturalist_workflow` Provider: iNaturalist @@ -575,7 +575,7 @@ We use the table structure defined here, https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql except for adding ancestry tags to the taxa table. -## `jamendo_workflow` +### `jamendo_workflow` Content Provider: Jamendo @@ -588,7 +588,7 @@ non-commercial apps Jamendo Music has more than 500,000 tracks shared by 40,000 artists from over 150 countries all over the world. Audio quality: uploaded as WAV/ FLAC/ AIFF bit depth: 16/24 sample rate: 44.1 or 48 kHz channels: 1/2 -## `justtakeitfree_workflow` +### `justtakeitfree_workflow` Content Provider: Justtakeitfree @@ -599,7 +599,7 @@ Output: TSV file containing the media and the respective meta-data. Notes: https://justtakeitfree.com/api/api.php This API requires an API key. For more details, see https://github.com/WordPress/openverse/pull/2793 -## `metropolitan_museum_reingestion_workflow` +### `metropolitan_museum_reingestion_workflow` Content Provider: Metropolitan Museum of Art @@ -623,7 +623,7 @@ blocking during local development testing. connect with just date and license. https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 -## `metropolitan_museum_workflow` +### `metropolitan_museum_workflow` Content Provider: Metropolitan Museum of Art @@ -647,7 +647,7 @@ blocking during local development testing. connect with just date and license. https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 -## `nappy_workflow` +### `nappy_workflow` Content Provider: Nappy @@ -658,9 +658,9 @@ Output: TSV file containing the image meta-data. Notes: This api was written specially for Openverse. There are no known limits or restrictions. https://nappy.co/ -## `oauth2_authorization` +### `oauth2_authorization` -### OAuth Provider Authorization +#### OAuth Provider Authorization Iterates through all the OAuth2 providers and attempts to authorize them using tokens found in the in the `OAUTH2_AUTH_KEYS` Variable. Once authorization has @@ -672,9 +672,9 @@ authorization will create an access/refresh token pair in the - Freesound -## `oauth2_token_refresh` +### `oauth2_token_refresh` -### OAuth Provider Token Refresh +#### OAuth Provider Token Refresh Iterates through all OAuth2 providers and attempts to refresh the access token using the refresh token stored in the `OAUTH2_ACCESS_TOKENS` Variable. This DAG @@ -684,7 +684,7 @@ will update the tokens stored in the Variable upon successful refresh. 
- Freesound -## `phylopic_reingestion_workflow` +### `phylopic_reingestion_workflow` Content Provider: PhyloPic @@ -694,7 +694,7 @@ Output: TSV file containing the image, their respective meta-data. Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. -## `phylopic_workflow` +### `phylopic_workflow` Content Provider: PhyloPic @@ -704,9 +704,9 @@ Output: TSV file containing the image, their respective meta-data. Notes: http://api-docs.phylopic.org/v2/ No rate limit specified. -## `pr_review_reminders` +### `pr_review_reminders` -### PR Review Reminders +#### PR Review Reminders Iterates through open PRs in our repositories and pings assigned reviewers who have not yet approved the PR or explicitly requested changes. @@ -727,7 +727,7 @@ Unfortunately the DAG does not know when someone is on vacation. It is up to the author of the PR to re-assign review if one of the randomly selected reviewers is unavailable for the time period during which the PR should be reviewed. -## `rawpixel_workflow` +### `rawpixel_workflow` Content Provider: Rawpixel @@ -741,7 +741,7 @@ issues. The public API max results range is limited to 100,000 results, although the API key we've been given can circumvent this limit. https://www.rawpixel.com/api/v1/search?tags=$publicdomain&page=1&pagesize=100 -## `recreate_audio_popularity_calculation` +### `recreate_audio_popularity_calculation` This file generates Apache Airflow DAGs that, for the given media type, completely wipes out and recreates the PostgreSQL functions involved in @@ -755,7 +755,7 @@ popularity constants and standardized popularity scores using the new functions. These DAGs are not on a schedule, and should only be run manually when new SQL code is deployed for the calculation. -## `recreate_image_popularity_calculation` +### `recreate_image_popularity_calculation` This file generates Apache Airflow DAGs that, for the given media type, completely wipes out and recreates the PostgreSQL functions involved in @@ -769,9 +769,9 @@ popularity constants and standardized popularity scores using the new functions. These DAGs are not on a schedule, and should only be run manually when new SQL code is deployed for the calculation. -## `report_pending_reported_media` +### `report_pending_reported_media` -### Report Pending Reported Media DAG +#### Report Pending Reported Media DAG This DAG checks for any user-reported media pending manual review, and alerts via Slack. @@ -782,7 +782,7 @@ whether further action (such as deindexing the record) needs to be taken. If a record has been reported multiple times, it only needs to be reviewed once and so is only counted once in the reporting by this DAG. -## `rotate_db_snapshots` +### `rotate_db_snapshots` Manages weekly database snapshots. @@ -799,7 +799,7 @@ Requires two variables: `CATALOG_RDS_DB_IDENTIFIER`: The "DBIdentifier" of the RDS DB instance. `CATALOG_RDS_SNAPSHOTS_TO_RETAIN`: How many historical snapshots to retain. -## `science_museum_workflow` +### `science_museum_workflow` Content Provider: Science Museum @@ -811,7 +811,7 @@ Notes: https://github.com/TheScienceMuseum/collectionsonline/wiki/Collections-Online-API Rate limited, no specific rate given. -## `smithsonian_workflow` +### `smithsonian_workflow` Content Provider: Smithsonian @@ -821,7 +821,7 @@ Output: TSV file containing the images and the respective meta-data. 
Notes: https://api.si.edu/openaccess/api/v1.0/search -## `smk_workflow` +### `smk_workflow` Content Provider: Statens Museum for Kunst (National Gallery of Denmark) @@ -831,9 +831,9 @@ Output: TSV file containing the media metadata. Notes: https://www.smk.dk/en/article/smk-api/ -## `staging_database_restore` +### `staging_database_restore` -### Update the staging database +#### Update the staging database This DAG is responsible for updating the staging database using the most recent snapshot of the production database. @@ -855,7 +855,7 @@ the RDS operations run using a different hook: - `AIRFLOW_CONN_`: The connection string to use for RDS operations (per the above example, it might be `AIRFLOW_CONN_AWS_RDS`) -## `stocksnap_workflow` +### `stocksnap_workflow` Content Provider: StockSnap @@ -867,7 +867,7 @@ Notes: https://stocksnap.io/api/load-photos/date/desc/1 https://stocksnap.io/faq All images are licensed under CC0. No rate limits or authorization required. API is undocumented. -## `wikimedia_commons_workflow` +### `wikimedia_commons_workflow` **Content Provider:** Wikimedia Commons @@ -875,7 +875,7 @@ is undocumented. **Output:** TSV file containing the image, the respective meta-data. -### Notes +#### Notes Rate limit of no more than 200 requests/second, and we are required to set a unique User-Agent field @@ -967,7 +967,7 @@ cases where they might exceed the limit. Technically, it's feasible for almost any property to exceed the limit, but these are the ones that we've seen in practice. -#### `imageinfo` +##### `imageinfo` [Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) @@ -977,7 +977,7 @@ practice. For these requests, we can remove the `metadata` property from the `iiprops` parameter to avoid this issue on subsequent iterations. -#### `globalusage` +##### `globalusage` [Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) @@ -986,7 +986,7 @@ parameter to avoid this issue on subsequent iterations. For these requests, we can remove the `globalusage` property from the `prop` parameter entirely and eschew the popularity data for these items. -## `wikimedia_reingestion_workflow` +### `wikimedia_reingestion_workflow` **Content Provider:** Wikimedia Commons @@ -994,7 +994,7 @@ parameter entirely and eschew the popularity data for these items. **Output:** TSV file containing the image, the respective meta-data. -### Notes +#### Notes Rate limit of no more than 200 requests/second, and we are required to set a unique User-Agent field @@ -1086,7 +1086,7 @@ cases where they might exceed the limit. Technically, it's feasible for almost any property to exceed the limit, but these are the ones that we've seen in practice. -#### `imageinfo` +##### `imageinfo` [Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo) @@ -1096,7 +1096,7 @@ practice. For these requests, we can remove the `metadata` property from the `iiprops` parameter to avoid this issue on subsequent iterations. -#### `globalusage` +##### `globalusage` [Docs](https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bglobalusage) @@ -1105,7 +1105,7 @@ parameter to avoid this issue on subsequent iterations. For these requests, we can remove the `globalusage` property from the `prop` parameter entirely and eschew the popularity data for these items. 
-## `wordpress_workflow` +### `wordpress_workflow` Content Provider: WordPress Photo Directory From e3e00e4fc36aeebafc49ee87dd71977e627a5de6 Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Mon, 25 Sep 2023 19:34:31 -0700 Subject: [PATCH 7/8] Fix headers in tests --- .../utilities/dag_doc_gen/test_dag_doc_generation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/catalog/tests/utilities/dag_doc_gen/test_dag_doc_generation.py b/catalog/tests/utilities/dag_doc_gen/test_dag_doc_generation.py index b7442a90f02..f65fac2fec3 100644 --- a/catalog/tests/utilities/dag_doc_gen/test_dag_doc_generation.py +++ b/catalog/tests/utilities/dag_doc_gen/test_dag_doc_generation.py @@ -33,7 +33,7 @@ class DagMock(NamedTuple): [ (None, None), ("Sample simple doc", "Sample simple doc"), - ("# Big header", "### Big header"), + ("# Big header", "#### Big header"), ], ) @pytest.mark.parametrize( @@ -107,7 +107,7 @@ def test_get_dags_info( ), False, """ -## Special Name +### Special Name | DAG ID | Schedule Interval | | --- | --- | @@ -126,7 +126,7 @@ def test_get_dags_info( ), False, """ -## Special Name +### Special Name | DAG ID | Schedule Interval | | --- | --- | @@ -145,7 +145,7 @@ def test_get_dags_info( ), True, """ -## Special Name +### Special Name | DAG ID | Schedule Interval | Dated | Media Type(s) | | --- | --- | --- | --- | @@ -167,7 +167,7 @@ def test_generate_dag_doc(): + """\ 1. [T1](#t1) -## T1 +### T1 | DAG ID | Schedule Interval | | --- | --- | @@ -180,7 +180,7 @@ def test_generate_dag_doc(): 1. [`b`](#b) -## `b` +### `b` this one has a doc """ From 07ccc5bad5a492c62cda7786a2d19bdce5da67c7 Mon Sep 17 00:00:00 2001 From: ngken0995 Date: Tue, 26 Sep 2023 08:48:26 -0400 Subject: [PATCH 8/8] add path to pass github action --- catalog/justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalog/justfile b/catalog/justfile index dec78c046b9..fd34546cd23 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -134,7 +134,7 @@ generate-dag-docs fail_on_diff="false": echo "Done!" if {{ fail_on_diff }}; then set +e - git diff --exit-code DAGs.md + git diff --exit-code -- documentation/catalog/reference/DAGs.md if [ $? -ne 0 ]; then printf "\n\n\e[31m!! Changes found in DAG documentation, please run 'just generate-dag-docs' locally and commit difference !!\n\n" exit 1
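
Editor's note: for reviewers who want to exercise the check that PATCH 8/8 wires into CI, a minimal local sketch follows. It assumes `just generate-dag-docs` is invoked the way the recipe's own error message suggests, and that `documentation/catalog/reference/DAGs.md` is where the generated file lands after the full series (the earlier patches move it in steps, so the final path is taken from the last hunk):

    # Regenerate the DAG documentation, then verify the committed copy is current.
    # This mirrors the fail_on_diff branch of the generate-dag-docs recipe used in CI.
    just generate-dag-docs
    git diff --exit-code -- documentation/catalog/reference/DAGs.md \
      || echo "DAGs.md is out of date; run 'just generate-dag-docs' and commit the result"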