From ae6ce544f0cacdc11aebf898174e14eed1cfb57a Mon Sep 17 00:00:00 2001 From: Dhruv Bhanushali Date: Fri, 15 Dec 2023 18:50:46 +0400 Subject: [PATCH] Add DAG for ccMixter (#3479) Co-authored-by: Staci Mullins <63313398+stacimc@users.noreply.github.com> Co-authored-by: Krystle Salazar Co-authored-by: Madison Swain-Bowden --- .../dags/common/loader/provider_details.py | 1 + catalog/dags/common/requester.py | 12 +- .../popularity/popularity_refresh_types.py | 1 + .../provider_api_scripts/cc_mixter.py | 286 ++++++++++++++++++ catalog/dags/providers/provider_workflows.py | 5 + .../cc_mixter/expected_single_record.json | 47 +++ .../resources/cc_mixter/single_item.json | 110 +++++++ .../provider_api_scripts/test_cc_mixter.py | 111 +++++++ .../0007_openledger_audio_view.sql | 3 +- documentation/catalog/reference/DAGs.md | 14 + 10 files changed, 584 insertions(+), 6 deletions(-) create mode 100644 catalog/dags/providers/provider_api_scripts/cc_mixter.py create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json create mode 100644 catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py index ff82904711f..c3d8530ad32 100644 --- a/catalog/dags/common/loader/provider_details.py +++ b/catalog/dags/common/loader/provider_details.py @@ -35,6 +35,7 @@ WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia" WORDPRESS_DEFAULT_PROVIDER = "wordpress" PHYLOPIC_DEFAULT_PROVIDER = "phylopic" +CC_MIXTER_DEFAULT_PROVIDER = "ccmixter" # Finnish parameters FINNISH_SUB_PROVIDERS = { diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py index 6ce81aff176..b0ef1cc1af5 100644 --- a/catalog/dags/common/requester.py +++ b/catalog/dags/common/requester.py @@ -128,6 +128,12 @@ def _delay_processing(self): 
logging.debug(f"Waiting {wait} second(s)") time.sleep(wait) + def _get_json(self, response) -> dict | list | None: + try: + return response.json() + except JSONDecodeError as e: + logger.warning(f"Could not get response_json.\n{e}") + def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs): response_json = None @@ -137,11 +143,7 @@ def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs): response = self.get(endpoint, params=query_params, **kwargs) if response is not None and response.status_code == 200: - try: - response_json = response.json() - except JSONDecodeError as e: - logger.warning(f"Could not get response_json.\n{e}") - response_json = None + response_json = self._get_json(response) if response_json is None or ( isinstance(response_json, dict) and response_json.get("error") is not None diff --git a/catalog/dags/popularity/popularity_refresh_types.py b/catalog/dags/popularity/popularity_refresh_types.py index b25867e9e27..67c9cb51912 100644 --- a/catalog/dags/popularity/popularity_refresh_types.py +++ b/catalog/dags/popularity/popularity_refresh_types.py @@ -84,6 +84,7 @@ def __post_init__(self): "jamendo": {"metric": "listens"}, "wikimedia_audio": {"metric": "global_usage_count"}, "freesound": {"metric": "num_downloads"}, + "ccmixter": {"metric": "upload_num_scores"}, }, ), ] diff --git a/catalog/dags/providers/provider_api_scripts/cc_mixter.py b/catalog/dags/providers/provider_api_scripts/cc_mixter.py new file mode 100644 index 00000000000..bc23223db63 --- /dev/null +++ b/catalog/dags/providers/provider_api_scripts/cc_mixter.py @@ -0,0 +1,286 @@ +""" +Content Provider: ccMixter + +ETL Process: Use the API to identify all CC licensed media. + +Output: TSV file containing the media and the + respective meta-data. + +Notes: Documentation: https://ccmixter.org/query-api + ccMixter sends bad JSON and extremely huge headers, both + of which need workarounds that are handled by this DAG. 
+""" + +import json +import logging +import re +from typing import Literal + +from common import constants +from common.licenses import get_license_info +from common.loader import provider_details as prov +from common.requester import DelayedRequester +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester + + +logger = logging.getLogger(__name__) + +JSON_OCTALS = re.compile(r":\s*0(?P<num>\d+)\s*(?P<sep>[,}])") + + +class CcMixterDelayedRequester(DelayedRequester): + """ + WORKAROUND! + + ccMixter sends bad JSON, including numbers with a leading 0 (observed some + such cases with "bpm" field). This makes the JSON invalid and raises decode + errors. + + This class extends ``DelayedRequester`` to supply a custom JSON decoding + step where we perform a text substitution first and then parse the JSON. + """ + + def _get_json(self, response): + raw_json = response.text + cleaned_json = JSON_OCTALS.sub(r":\g<num>\g<sep>", raw_json) + if cleaned_json == raw_json: + logger.debug("JSON was clean, no substitutions were made.") + else: + logger.warning("JSON had bad octals, substitutions were made.") + try: + # WORKAROUND! + # + # ccMixter sends JSON that can contain control characters which + # break JSON parsing, unless we use non-strict mode that can handle + # such malformed data as well. + response_json = json.loads(cleaned_json, strict=False) + except json.JSONDecodeError as e: + logger.warning(f"Could not get response_json.\n{e}") + response_json = None + return response_json + + +def patch_http_client(): + """ + WORKAROUND! + + ccMixter sends a very long ``X-Json`` header with the response, which causes + the ``http.client`` library to raise a ``LineTooLong`` error. + + We work around it by patching the ``_read_headers`` function to ignore the + line length limit. + + ..
seealso:: + + StackOverflow + `Answer with this approach `_ + + ``_read_headers()`` + `Original implementation `_ + """ + + import http.client + + def _read_headers(fp): + logger.debug("Patched _read_headers() called.") + + headers = [] + while True: + line = fp.readline() + headers.append(line) + if len(headers) > http.client._MAXHEADERS: + raise http.client.HTTPException( + f"Got more than {http.client._MAXHEADERS} headers." + ) + if line in (b"\r\n", b"\n", b""): + break + return headers + + http.client._read_headers = _read_headers + + +class CcMixterDataIngester(ProviderDataIngester): + providers = { + "audio": prov.CC_MIXTER_DEFAULT_PROVIDER, + } + endpoint = "https://ccmixter.org/api/query/" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Set-up workarounds! + patch_http_client() + self.delayed_requester = CcMixterDelayedRequester( + delay=self.delay, headers=self.headers + ) + + def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: + if not prev_query_params: + # This means this is the first request, so we start with offset 0. + return { + "format": "json", + "limit": self.batch_limit, + "offset": 0, + } + else: + # This is a subsequent request, so we bump the offset by a value + # equal to the batch limit. + return { + **prev_query_params, + "offset": prev_query_params["offset"] + self.batch_limit, + } + + def get_batch_data(self, response_json): + return response_json + + def get_should_continue(self, response_json): + # We can know we are at the last page if the number of records returned + # is less than the batch limit. This is not an issue even if the + # penultimate page has a batch limit number of results because ccMixter + # allows paginating after the last page too, returning ``[]``, which is + # less than the batch limit. 
+ return len(response_json) >= self.batch_limit + + def get_media_type(self, record: dict) -> Literal["audio"]: + return constants.AUDIO + + @staticmethod + def _get_duration(ps: str | None) -> int | None: + """ + Convert a duration string to the number of milliseconds. This function + can handle between 1 and 3 segments in the time string. + + :param ps: the human-friendly duration string + :return: the number of milliseconds + """ + + if not ps: + return None + + segments = map(int, [0, 0] + ps.split(":")) + *_, hours, minutes, seconds = segments + return (hours * 3600 + minutes * 60 + seconds) * 1000 + + @staticmethod + def _get_sample_rate(sr: str | None) -> int | None: + """ + Convert the sample rate from a human-friendly string to the integer + number of samples per second. + + :param sr: the human-friendly sample rate + :return: the number of samples per second + """ + + return int(float(sr.rstrip("k")) * 1000) if sr else None + + def _get_audio_files( + self, files: list[dict] + ) -> tuple[dict, list[dict]] | tuple[None, None]: + """ + Filter the audio files from the file list and identify the main one. + + The list of files can include archives like ZIP files, which we drop. + The smallest audio file is assumed to be the main one, which is usually + MP3 in the case of ccMixter. + + This is because the smallest audio file takes the least time to start + streaming, wastes the least data if not useful and MP3 is the most + widely supported format. 
+ + :param files: the list of files supplied by ccMixter + :return: the main file and a list of alternative files + """ + + files = [ + { + "url": file["download_url"], + "filesize": file["file_rawsize"], + "filetype": file["file_format_info"]["default-ext"], + "sample_rate": self._get_sample_rate( + file["file_format_info"].get("sr") + ), + "duration": self._get_duration(file["file_format_info"].get("ps")), + } + for file in files + if file["file_format_info"]["media-type"] == "audio" + ] + if not files: + return None, None + + main_file, *alt_files = sorted(files, key=lambda file: file["filesize"]) + return main_file, alt_files + + def get_record_data(self, data: dict) -> dict | list[dict] | None: + if not (foreign_identifier := data.get("upload_id")): + logger.warning("Rejected record with no foreign identifier.") + return None + + if not (foreign_landing_url := data.get("file_page_url")): + logger.warning( + f"Rejected record {foreign_identifier} with no foreign landing URL." + ) + return None + + # Use the `get_license_info` utility to get license information from a URL. + license_url = data.get("license_url") + license_info = get_license_info(license_url) + if not license_info: + logger.warning( + f"Rejected record {foreign_identifier} with no license info." + ) + return None + + if not (files := data.get("files")): + return None + main_file, alt_files = self._get_audio_files(files) + if not main_file: + logger.warning( + f"Rejected record {foreign_identifier} with no main audio file." + ) + return None + + # Optional fields + + creator = data.get("user_real_name") + creator_url = data.get("artist_page_url") + title = data.get("upload_name") + meta_data = { + "description": data.get("upload_description_plain"), + "description_html": data.get("upload_description_html"), + "upload_num_scores": data.get("upload_num_scores", 0), + } + + # ccMixter tags are comma-separated, and there is a leading and trailing + # comma, so we need to filter out empty strings. 
+ raw_tags = list(filter(None, data.get("upload_tags").split(","))) + + return { + "foreign_identifier": foreign_identifier, + "foreign_landing_url": foreign_landing_url, + "license_info": license_info, + # Optional fields + "creator": creator, + "creator_url": creator_url, + "title": title, + "meta_data": meta_data, + "raw_tags": raw_tags, + "alt_files": alt_files, + # ``main_file`` contains the following fields: + # - ``url`` + # - ``filesize`` + # - ``filetype`` + # - ``sample_rate`` + # - ``duration`` + **main_file, + } + + +def main(): + # Allows running ingestion from the CLI without Airflow running for debugging + # purposes. + ingester = CcMixterDataIngester() + ingester.ingest_records() + + +if __name__ == "__main__": + main() diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py index 4c33ea74db8..52cee8b7fb8 100644 --- a/catalog/dags/providers/provider_workflows.py +++ b/catalog/dags/providers/provider_workflows.py @@ -9,6 +9,7 @@ from typing_extensions import NotRequired, TypedDict from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester +from providers.provider_api_scripts.cc_mixter import CcMixterDataIngester from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester from providers.provider_api_scripts.europeana import EuropeanaDataIngester from providers.provider_api_scripts.finnish_museums import FinnishMuseumsDataIngester @@ -196,6 +197,10 @@ def __post_init__(self): start_date=datetime(2020, 1, 1), ingester_class=BrooklynMuseumDataIngester, ), + ProviderWorkflow( + start_date=datetime(2023, 11, 30), + ingester_class=CcMixterDataIngester, + ), ProviderWorkflow( ingester_class=ClevelandDataIngester, start_date=datetime(2020, 1, 15), diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json new file 
mode 100644 index 00000000000..7f396411bc6 --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json @@ -0,0 +1,47 @@ +{ + "foreign_identifier": 69420, + "foreign_landing_url": "https://ccmixter.org/files/testuser/69420", + "license_info": { + "license": "by-nc", + "version": "4.0", + "url": "https://creativecommons.org/licenses/by-nc/4.0/", + "raw_url": "https://creativecommons.org/licenses/by-nc/4.0/" + }, + "creator": "Test User", + "creator_url": "https://ccmixter.org/people/testuser", + "title": "Test Music", + "meta_data": { + "description": "Thanks to:\r\nOther Test User – sax\r\nAnother Test User – bagpipes and sitar", + "description_html": "Thanks to:
\r\nOther Test User – sax
\r\nAnother Test User – bagpipes and sitar", + "upload_num_scores": 420 + }, + "raw_tags": [ + "remix", + "media", + "bpm_090_095", + "non_commercial", + "audio", + "mp3", + "flac", + "zip", + "44k", + "stereo", + "CBR", + "VBR", + "instrumental" + ], + "url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.mp3", + "sample_rate": 44100, + "filetype": "mp3", + "filesize": 11150278, + "duration": 346000, + "alt_files": [ + { + "url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.flac", + "sample_rate": 44000, + "filetype": "flac", + "filesize": 111502780, + "duration": 3946000 + } + ] +} diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json new file mode 100644 index 00000000000..d21e87f9d6a --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json @@ -0,0 +1,110 @@ +{ + "upload_id": 69420, + "upload_name": "Test Music", + "upload_extra": { + "usertags": "instrumental", + "ccud": "remix,media,bpm_090_095", + "systags": "non_commercial,audio,mp3,flac,zip,44k,stereo,CBR,VBR", + "bpm": 94, + "relative_dir": "content/testuser", + "featuring": "Other Test User, Another Test User", + "nsfw": false + }, + "user_name": "testuser", + "upload_tags": ",remix,media,bpm_090_095,non_commercial,audio,mp3,flac,zip,44k,stereo,CBR,VBR,instrumental,", + "file_page_url": "https://ccmixter.org/files/testuser/69420", + "user_real_name": "Test User", + "artist_page_url": "https://ccmixter.org/people/testuser", + "license_logo_url": "https://ccmixter.org/ccskins/shared/images/lics/small-by-nc-3.png", + "license_url": "https://creativecommons.org/licenses/by-nc/4.0/", + "license_name": "Attribution Noncommercial (4.0)", + "upload_date_format": "Wed, Dec 6, 2023 @ 4:47 AM", + "upload_num_scores": 420, + "files": [ + { + "file_id": 694201, + "file_upload": 69420, + "file_name": 
"testuser_-_Test_Music.zip", + "file_nicname": "Instruments", + "file_format_info": { + "media-type": "archive", + "format-name": "archive-zip-", + "default-ext": "zip", + "mime_type": "application/zip", + "zipdir": { + "files": [ + "/Sax.flac (11.78MB)", + "/Bagpipes.flac (17.65MB)", + "/Sitar.flac (9.74MB)" + ] + } + }, + "file_extra": { + "sha1": "D73BDA58A34B029F63CF7A5CE8C36914" + }, + "file_filesize": " (10.63MB)", + "file_order": 0, + "file_is_remote": 0, + "file_num_download": 0, + "download_url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.zip", + "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.zip", + "file_rawsize": 11150278 + }, + { + "file_id": 694202, + "file_upload": 69420, + "file_name": "testuser_-_Test_Music.flac", + "file_nicname": "flac", + "file_format_info": { + "media-type": "audio", + "format-name": "audio-flac-flac", + "default-ext": "flac", + "mime_type": "audio/x-flac", + "sr": "44k", + "ch": "stereo", + "ps": "1:05:46", + "br": "VBR" + }, + "file_extra": { + "ccud": "sample,media", + "type": "samples", + "sha1": "65A8E27D8879283831B664BD8B7F0AD4" + }, + "file_filesize": " (106.34MB)", + "file_order": 0, + "file_is_remote": 0, + "file_num_download": 0, + "download_url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.flac", + "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.flac", + "file_rawsize": 111502780 + }, + { + "file_id": 694203, + "file_upload": 69420, + "file_name": "testuser_-_Test_Music.mp3", + "file_nicname": "mp3", + "file_format_info": { + "media-type": "audio", + "format-name": "audio-mp3-mp3", + "default-ext": "mp3", + "mime_type": "audio/mpeg", + "sr": "44.1k", + "ch": "mono", + "ps": "5:46", + "br": "VBR" + }, + "file_extra": { + "sha1": "C2B9A98D0CA61853B0ED8ED89DC6D9C3" + }, + "file_filesize": " (10.63MB)", + "file_order": 0, + "file_is_remote": 0, + "file_num_download": 0, + "download_url": 
"https://ccmixter.org/content/testuser/testuser_-_Test_Music.mp3", + "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.mp3", + "file_rawsize": 11150278 + } + ], + "upload_description_plain": "Thanks to:\r\nOther Test User – sax\r\nAnother Test User – bagpipes and sitar", + "upload_description_html": "Thanks to:
\r\nOther Test User – sax
\r\nAnother Test User – bagpipes and sitar" +} diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py b/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py new file mode 100644 index 00000000000..7c54a5c46c5 --- /dev/null +++ b/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py @@ -0,0 +1,111 @@ +""" +Run these tests locally with `just test -k cc_mixter` +""" + +import json +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from common.licenses import LicenseInfo +from providers.provider_api_scripts.cc_mixter import CcMixterDataIngester + + +RESOURCES = Path(__file__).parent / "resources/cc_mixter" + +# Set up test class +ingester = CcMixterDataIngester() + + +def test_custom_requester_parses_bad_json(): + response = Mock(text='{"value": 0123}') + + with pytest.raises(json.decoder.JSONDecodeError): + json.loads(response.text) + + assert ingester.delayed_requester._get_json(response) == {"value": 123} + + +@pytest.mark.parametrize( + "time_string, expected_ms", + [ + ("2", 2000), # one segment + ("1:2", 62000), # two segments + ("1:2:3", 3723000), # three segments + ("01:02:03", 3723000), # leading zeros + ], +) +def test_durations_converted_to_ms(time_string, expected_ms): + actual_ms = ingester._get_duration(time_string) + assert actual_ms == expected_ms + + +@pytest.mark.parametrize( + "input_ext, output_ext", + [ + ([], (None, None)), # no files + (["zip"], (None, None)), # ZIP is not audio so no audio files + (["flac", "mp3"], ("mp3", ["flac"])), # MP3 is smaller than FLAC + (["zip", "flac"], ("flac", [])), # ZIP is not audio but FLAC is + ], +) +def test_audio_files( + input_ext: list[str], output_ext: tuple[str, list[str]] | tuple[None, None] +): + with (RESOURCES / "single_item.json").open() as f: + single_item = json.load(f) + + input_files = [ + file + for file in single_item["files"] + if file["file_format_info"]["default-ext"] in input_ext + ] + input_files.sort( + 
key=lambda file: input_ext.index(file["file_format_info"]["default-ext"]) + ) + + output_files = ingester._get_audio_files(input_files) + if output_ext == (None, None): + assert output_files == output_ext + else: + assert output_files[0]["filetype"] == output_ext[0] + assert [file["filetype"] for file in output_files[1]] == output_ext[1] + + +def test_get_next_query_params_provides_parameters(): + prev_params = None + for idx in range(3): + next_params = ingester.get_next_query_params(prev_params) + expected_next_params = {"format": "json", "limit": 100, "offset": idx * 100} + assert next_params == expected_next_params + prev_params = next_params + + +@pytest.mark.parametrize( + "results_len, should_continue", + [ + (101, True), + (100, True), + (99, False), + ], +) +def test_determines_when_to_continue(results_len: int, should_continue: bool): + actual_result = ingester.get_should_continue([{}] * results_len) + assert actual_result == should_continue + + +def test_get_record_data(): + # Sample code for loading in the sample json + with (RESOURCES / "single_item.json").open() as f: + single_item = json.load(f) + + single_record = ingester.get_record_data(single_item) + + with (RESOURCES / "expected_single_record.json").open() as f: + expected_single_record = json.load(f) + expected_single_record["license_info"] = LicenseInfo( + **expected_single_record["license_info"] + ) + + assert single_record == expected_single_record diff --git a/docker/upstream_db/0007_openledger_audio_view.sql b/docker/upstream_db/0007_openledger_audio_view.sql index af625f4c261..da072fa0379 100644 --- a/docker/upstream_db/0007_openledger_audio_view.sql +++ b/docker/upstream_db/0007_openledger_audio_view.sql @@ -12,7 +12,8 @@ INSERT INTO public.audio_popularity_metrics ( ) VALUES ('wikimedia_audio', 'global_usage_count', 0.85), ('jamendo', 'listens', 0.85), - ('freesound', 'num_downloads', 0.85); + ('freesound', 'num_downloads', 0.85), + ('ccmixter', 'upload_num_scores', 0.85); CREATE 
FUNCTION audio_popularity_percentile( diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md index bf4f550c8ca..ad06a123255 100644 --- a/documentation/catalog/reference/DAGs.md +++ b/documentation/catalog/reference/DAGs.md @@ -86,6 +86,7 @@ The following are DAGs grouped by their primary tag: | DAG ID | Schedule Interval | Dated | Media Type(s) | | --------------------------------------------------------------- | ----------------- | ------- | ------------- | | `brooklyn_museum_workflow` | `@monthly` | `False` | image | +| [`cc_mixter_workflow`](#cc_mixter_workflow) | `@monthly` | `False` | audio | | `cleveland_museum_workflow` | `@monthly` | `False` | image | | [`europeana_workflow`](#europeana_workflow) | `@daily` | `True` | image | | [`finnish_museums_workflow`](#finnish_museums_workflow) | `@daily` | `True` | image | @@ -125,6 +126,7 @@ The following is documentation associated with each DAG (where available): 1. [`audio_data_refresh`](#audio_data_refresh) 1. [`audio_popularity_refresh`](#audio_popularity_refresh) 1. [`batched_update`](#batched_update) +1. [`cc_mixter_workflow`](#cc_mixter_workflow) 1. [`check_silenced_dags`](#check_silenced_dags) 1. [`create_filtered_audio_index`](#create_filtered_audio_index) 1. [`create_filtered_image_index`](#create_filtered_image_index) @@ -315,6 +317,18 @@ used when the DagRun configuration needs to be changed after the table was already created: for example, if there was a problem with the `update_query` which caused DAG failures during the `update_batches` step. +### `cc_mixter_workflow` + +Content Provider: ccMixter + +ETL Process: Use the API to identify all CC licensed media. + +Output: TSV file containing the media and the respective meta-data. + +Notes: Documentation: https://ccmixter.org/query-api ccMixter sends bad JSON and +extremely huge headers, both of which need workarounds that are handled by this +DAG. + ### `check_silenced_dags` #### Silenced DAGs check