From ae6ce544f0cacdc11aebf898174e14eed1cfb57a Mon Sep 17 00:00:00 2001
From: Dhruv Bhanushali <hi@dhruvkb.dev>
Date: Fri, 15 Dec 2023 18:50:46 +0400
Subject: [PATCH] Add DAG for ccMixter (#3479)

Co-authored-by: Staci Mullins <63313398+stacimc@users.noreply.github.com>
Co-authored-by: Krystle Salazar <krystle.salazar@automattic.com>
Co-authored-by: Madison Swain-Bowden <bowdenm@spu.edu>
---
 .../dags/common/loader/provider_details.py    |   1 +
 catalog/dags/common/requester.py              |  12 +-
 .../popularity/popularity_refresh_types.py    |   1 +
 .../provider_api_scripts/cc_mixter.py         | 286 ++++++++++++++++++
 catalog/dags/providers/provider_workflows.py  |   5 +
 .../cc_mixter/expected_single_record.json     |  47 +++
 .../resources/cc_mixter/single_item.json      | 110 +++++++
 .../provider_api_scripts/test_cc_mixter.py    | 111 +++++++
 .../0007_openledger_audio_view.sql            |   3 +-
 documentation/catalog/reference/DAGs.md       |  14 +
 10 files changed, 584 insertions(+), 6 deletions(-)
 create mode 100644 catalog/dags/providers/provider_api_scripts/cc_mixter.py
 create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json
 create mode 100644 catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json
 create mode 100644 catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py

diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py
index ff82904711f..c3d8530ad32 100644
--- a/catalog/dags/common/loader/provider_details.py
+++ b/catalog/dags/common/loader/provider_details.py
@@ -35,6 +35,7 @@
 WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia"
 WORDPRESS_DEFAULT_PROVIDER = "wordpress"
 PHYLOPIC_DEFAULT_PROVIDER = "phylopic"
+CC_MIXTER_DEFAULT_PROVIDER = "ccmixter"
 
 # Finnish parameters
 FINNISH_SUB_PROVIDERS = {
diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py
index 6ce81aff176..b0ef1cc1af5 100644
--- a/catalog/dags/common/requester.py
+++ b/catalog/dags/common/requester.py
@@ -128,6 +128,12 @@ def _delay_processing(self):
             logging.debug(f"Waiting {wait} second(s)")
             time.sleep(wait)
 
+    def _get_json(self, response) -> dict | list | None:
+        try:
+            return response.json()
+        except JSONDecodeError as e:
+            logger.warning(f"Could not get response_json.\n{e}")
+
     def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs):
         response_json = None
 
@@ -137,11 +143,7 @@ def get_response_json(self, endpoint, retries=0, query_params=None, **kwargs):
 
         response = self.get(endpoint, params=query_params, **kwargs)
         if response is not None and response.status_code == 200:
-            try:
-                response_json = response.json()
-            except JSONDecodeError as e:
-                logger.warning(f"Could not get response_json.\n{e}")
-                response_json = None
+            response_json = self._get_json(response)
 
         if response_json is None or (
             isinstance(response_json, dict) and response_json.get("error") is not None
diff --git a/catalog/dags/popularity/popularity_refresh_types.py b/catalog/dags/popularity/popularity_refresh_types.py
index b25867e9e27..67c9cb51912 100644
--- a/catalog/dags/popularity/popularity_refresh_types.py
+++ b/catalog/dags/popularity/popularity_refresh_types.py
@@ -84,6 +84,7 @@ def __post_init__(self):
             "jamendo": {"metric": "listens"},
             "wikimedia_audio": {"metric": "global_usage_count"},
             "freesound": {"metric": "num_downloads"},
+            "ccmixter": {"metric": "upload_num_scores"},
         },
     ),
 ]
diff --git a/catalog/dags/providers/provider_api_scripts/cc_mixter.py b/catalog/dags/providers/provider_api_scripts/cc_mixter.py
new file mode 100644
index 00000000000..bc23223db63
--- /dev/null
+++ b/catalog/dags/providers/provider_api_scripts/cc_mixter.py
@@ -0,0 +1,286 @@
+"""
+Content Provider:       ccMixter
+
+ETL Process:            Use the API to identify all CC licensed media.
+
+Output:                 TSV file containing the media and the
+                        respective meta-data.
+
+Notes:                  Documentation: https://ccmixter.org/query-api
+                        ccMixter sends bad JSON and extremely huge headers, both
+                        of which need workarounds that are handled by this DAG.
+"""
+
+import json
+import logging
+import re
+from typing import Literal
+
+from common import constants
+from common.licenses import get_license_info
+from common.loader import provider_details as prov
+from common.requester import DelayedRequester
+from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
+
+
+logger = logging.getLogger(__name__)
+
+JSON_OCTALS = re.compile(r":\s*0+(?P<num>\d+)\s*(?P<sep>[,}])")
+
+
+class CcMixterDelayedRequester(DelayedRequester):
+    """
+    WORKAROUND!
+
+    ccMixter responses are not always valid JSON: numbers can carry a leading
+    0 (observed on the "bpm" field), which makes the standard decoder raise
+    decode errors.
+
+    This ``DelayedRequester`` subclass overrides the JSON decoding step so that
+    the raw text is repaired by substitution before it is parsed.
+    """
+
+    def _get_json(self, response):
+        """Decode the response text as JSON after repairing malformed numbers."""
+        original_text = response.text
+        repaired_text = JSON_OCTALS.sub(r":\g<num>\g<sep>", original_text)
+        if repaired_text != original_text:
+            logger.warning("JSON had bad octals, substitutions were made.")
+        else:
+            logger.debug("JSON was clean, no substitutions were made.")
+
+        # WORKAROUND!
+        #
+        # ccMixter sends JSON that can contain control characters which break
+        # strict parsing, so the decoder runs in non-strict mode to accept them.
+        try:
+            return json.loads(repaired_text, strict=False)
+        except json.JSONDecodeError as e:
+            logger.warning(f"Could not get response_json.\n{e}")
+            return None
+
+
+def patch_http_client():
+    """
+    WORKAROUND!
+
+    ccMixter sends a very long ``X-Json`` header with the response, which causes
+    the ``http.client`` library to raise a ``LineTooLong`` error.
+
+    We work around it by replacing ``http.client._read_headers`` for the whole
+    process with a version that still caps the header count but not line length.
+
+    .. seealso::
+
+        StackOverflow
+            `Answer with this approach <https://stackoverflow.com/a/63158213/2601645>`_
+
+        ``_read_headers()``
+            `Original implementation <https://github.com/python/cpython/blob/8b6ee5b/Lib/http/client.py#L206>`_
+    """
+
+    import http.client
+
+    def _read_headers(fp):
+        logger.debug("Patched _read_headers() called.")
+
+        headers = []
+        while True:
+            line = fp.readline()  # unbounded read: no line length limit
+            headers.append(line)
+            if len(headers) > http.client._MAXHEADERS:
+                raise http.client.HTTPException(
+                    f"Got more than {http.client._MAXHEADERS} headers."
+                )
+            if line in (b"\r\n", b"\n", b""):  # blank line or EOF ends the headers
+                break
+        return headers
+
+    http.client._read_headers = _read_headers
+
+
+class CcMixterDataIngester(ProviderDataIngester):
+    providers = {
+        "audio": prov.CC_MIXTER_DEFAULT_PROVIDER,
+    }
+    endpoint = "https://ccmixter.org/api/query/"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Set-up workarounds!
+        patch_http_client()
+        self.delayed_requester = CcMixterDelayedRequester(
+            delay=self.delay, headers=self.headers
+        )
+
+    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
+        """Return the query parameters for the next page of results."""
+        if prev_query_params:
+            # This is a subsequent request, so the previous parameters are kept
+            # and the offset moves forward by one batch.
+            next_offset = prev_query_params["offset"] + self.batch_limit
+            return {**prev_query_params, "offset": next_offset}
+
+        # There are no previous parameters, so this is the first request and
+        # pagination starts from offset 0.
+        return {
+            "format": "json",
+            "limit": self.batch_limit,
+            "offset": 0,
+        }
+
+    def get_batch_data(self, response_json):
+        return response_json
+
+    def get_should_continue(self, response_json):
+        # We can know we are at the last page if the number of records returned
+        # is less than the batch limit. This is not an issue even if the
+        # penultimate page has a batch limit number of results because ccMixter
+        # allows paginating after the last page too, returning ``[]``, which is
+        # less than the batch limit.
+        return len(response_json) >= self.batch_limit
+
+    def get_media_type(self, record: dict) -> Literal["audio"]:
+        return constants.AUDIO
+
+    @staticmethod
+    def _get_duration(ps: str | None) -> int | None:
+        """
+        Turn a colon-separated duration such as ``"1:05:46"`` into milliseconds.
+        Seconds, minutes and hours are read from the right, so 1 to 3 segments work.
+
+        :param ps: the human-friendly duration string
+        :return: the duration in milliseconds, or ``None`` for an empty value
+        """
+
+        if not ps:
+            return None
+
+        padded = [0, 0, *map(int, ps.split(":"))]
+        hrs, mins, secs = padded[-3:]
+        return (hrs * 3600 + mins * 60 + secs) * 1000
+
+    @staticmethod
+    def _get_sample_rate(sr: str | None) -> int | None:
+        """
+        Convert the sample rate from a human-friendly string to the integer
+        number of samples per second, rounding away float representation error.
+
+        :param sr: the human-friendly sample rate in kHz, for example ``"44.1k"``
+        :return: the number of samples per second, or ``None`` for an empty value
+        """
+
+        return round(float(sr.rstrip("k")) * 1000) if sr else None
+
+    def _get_audio_files(
+        self, files: list[dict]
+    ) -> tuple[dict, list[dict]] | tuple[None, None]:
+        """
+        Pick out the audio files from the file list and choose the main one.
+
+        ccMixter file lists can also contain non-audio entries such as ZIP
+        archives, and those are dropped. Among the audio files, the smallest is
+        taken as the main one; on ccMixter that is usually the MP3.
+
+        The smallest file is preferred as it starts streaming soonest and wastes
+        the least data if it turns out not to be useful, and MP3 is the most
+        widely supported format.
+
+        :param files: the list of files supplied by ccMixter
+        :return: the main file and a list of alternative files
+        """
+
+        audio_files = []
+        for file in files:
+            info = file["file_format_info"]
+            if info["media-type"] != "audio":
+                continue
+            audio_files.append(
+                {
+                    "url": file["download_url"],
+                    "filesize": file["file_rawsize"],
+                    "filetype": info["default-ext"],
+                    "sample_rate": self._get_sample_rate(info.get("sr")),
+                    "duration": self._get_duration(info.get("ps")),
+                }
+            )
+        if not audio_files:
+            return None, None
+        audio_files.sort(key=lambda file: file["filesize"])
+        return audio_files[0], audio_files[1:]
+
+    def get_record_data(self, data: dict) -> dict | list[dict] | None:
+        """Map one ccMixter upload to a record dict, or ``None`` if rejected."""
+        if not (foreign_identifier := data.get("upload_id")):
+            logger.warning("Rejected record with no foreign identifier.")
+            return None
+
+        if not (foreign_landing_url := data.get("file_page_url")):
+            logger.warning(
+                f"Rejected record {foreign_identifier} with no foreign landing URL."
+            )
+            return None
+
+        # Use the `get_license_info` utility to get license information from a URL.
+        license_url = data.get("license_url")
+        license_info = get_license_info(license_url)
+        if not license_info:
+            logger.warning(
+                f"Rejected record {foreign_identifier} with no license info."
+            )
+            return None
+
+        # A missing or empty file list yields no main file and is rejected below.
+        main_file, alt_files = self._get_audio_files(data.get("files") or [])
+        if not main_file:
+            logger.warning(
+                f"Rejected record {foreign_identifier} with no main audio file."
+            )
+            return None
+
+        # Optional fields
+
+        creator = data.get("user_real_name")
+        creator_url = data.get("artist_page_url")
+        title = data.get("upload_name")
+        meta_data = {
+            "description": data.get("upload_description_plain"),
+            "description_html": data.get("upload_description_html"),
+            "upload_num_scores": data.get("upload_num_scores", 0),
+        }
+
+        # ccMixter tags are comma-separated with a leading and trailing comma, so
+        # empty strings are filtered out; a missing field yields no tags.
+        raw_tags = list(filter(None, (data.get("upload_tags") or "").split(",")))
+
+        return {
+            "foreign_identifier": foreign_identifier,
+            "foreign_landing_url": foreign_landing_url,
+            "license_info": license_info,
+            # Optional fields
+            "creator": creator,
+            "creator_url": creator_url,
+            "title": title,
+            "meta_data": meta_data,
+            "raw_tags": raw_tags,
+            "alt_files": alt_files,
+            # ``main_file`` contains the following fields:
+            # - ``url``
+            # - ``filesize``
+            # - ``filetype``
+            # - ``sample_rate``
+            # - ``duration``
+            **main_file,
+        }
+
+
+def main():
+    # Allows running ingestion from the CLI without Airflow running for debugging
+    # purposes.
+    ingester = CcMixterDataIngester()
+    ingester.ingest_records()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/catalog/dags/providers/provider_workflows.py b/catalog/dags/providers/provider_workflows.py
index 4c33ea74db8..52cee8b7fb8 100644
--- a/catalog/dags/providers/provider_workflows.py
+++ b/catalog/dags/providers/provider_workflows.py
@@ -9,6 +9,7 @@
 from typing_extensions import NotRequired, TypedDict
 
 from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester
+from providers.provider_api_scripts.cc_mixter import CcMixterDataIngester
 from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester
 from providers.provider_api_scripts.europeana import EuropeanaDataIngester
 from providers.provider_api_scripts.finnish_museums import FinnishMuseumsDataIngester
@@ -196,6 +197,10 @@ def __post_init__(self):
         start_date=datetime(2020, 1, 1),
         ingester_class=BrooklynMuseumDataIngester,
     ),
+    ProviderWorkflow(
+        start_date=datetime(2023, 11, 30),
+        ingester_class=CcMixterDataIngester,
+    ),
     ProviderWorkflow(
         ingester_class=ClevelandDataIngester,
         start_date=datetime(2020, 1, 15),
diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json
new file mode 100644
index 00000000000..7f396411bc6
--- /dev/null
+++ b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/expected_single_record.json
@@ -0,0 +1,47 @@
+{
+  "foreign_identifier": 69420,
+  "foreign_landing_url": "https://ccmixter.org/files/testuser/69420",
+  "license_info": {
+    "license": "by-nc",
+    "version": "4.0",
+    "url": "https://creativecommons.org/licenses/by-nc/4.0/",
+    "raw_url": "https://creativecommons.org/licenses/by-nc/4.0/"
+  },
+  "creator": "Test User",
+  "creator_url": "https://ccmixter.org/people/testuser",
+  "title": "Test Music",
+  "meta_data": {
+    "description": "Thanks to:\r\nOther Test User – sax\r\nAnother Test User – bagpipes and sitar",
+    "description_html": "Thanks to:<br />\r\nOther Test User – sax<br />\r\nAnother Test User – bagpipes and sitar",
+    "upload_num_scores": 420
+  },
+  "raw_tags": [
+    "remix",
+    "media",
+    "bpm_090_095",
+    "non_commercial",
+    "audio",
+    "mp3",
+    "flac",
+    "zip",
+    "44k",
+    "stereo",
+    "CBR",
+    "VBR",
+    "instrumental"
+  ],
+  "url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.mp3",
+  "sample_rate": 44100,
+  "filetype": "mp3",
+  "filesize": 11150278,
+  "duration": 346000,
+  "alt_files": [
+    {
+      "url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.flac",
+      "sample_rate": 44000,
+      "filetype": "flac",
+      "filesize": 111502780,
+      "duration": 3946000
+    }
+  ]
+}
diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json
new file mode 100644
index 00000000000..d21e87f9d6a
--- /dev/null
+++ b/catalog/tests/dags/providers/provider_api_scripts/resources/cc_mixter/single_item.json
@@ -0,0 +1,110 @@
+{
+  "upload_id": 69420,
+  "upload_name": "Test Music",
+  "upload_extra": {
+    "usertags": "instrumental",
+    "ccud": "remix,media,bpm_090_095",
+    "systags": "non_commercial,audio,mp3,flac,zip,44k,stereo,CBR,VBR",
+    "bpm": 94,
+    "relative_dir": "content/testuser",
+    "featuring": "Other Test User, Another Test User",
+    "nsfw": false
+  },
+  "user_name": "testuser",
+  "upload_tags": ",remix,media,bpm_090_095,non_commercial,audio,mp3,flac,zip,44k,stereo,CBR,VBR,instrumental,",
+  "file_page_url": "https://ccmixter.org/files/testuser/69420",
+  "user_real_name": "Test User",
+  "artist_page_url": "https://ccmixter.org/people/testuser",
+  "license_logo_url": "https://ccmixter.org/ccskins/shared/images/lics/small-by-nc-3.png",
+  "license_url": "https://creativecommons.org/licenses/by-nc/4.0/",
+  "license_name": "Attribution Noncommercial (4.0)",
+  "upload_date_format": "Wed, Dec 6, 2023 @ 4:47 AM",
+  "upload_num_scores": 420,
+  "files": [
+    {
+      "file_id": 694201,
+      "file_upload": 69420,
+      "file_name": "testuser_-_Test_Music.zip",
+      "file_nicname": "Instruments",
+      "file_format_info": {
+        "media-type": "archive",
+        "format-name": "archive-zip-",
+        "default-ext": "zip",
+        "mime_type": "application/zip",
+        "zipdir": {
+          "files": [
+            "/Sax.flac  (11.78MB)",
+            "/Bagpipes.flac  (17.65MB)",
+            "/Sitar.flac  (9.74MB)"
+          ]
+        }
+      },
+      "file_extra": {
+        "sha1": "D73BDA58A34B029F63CF7A5CE8C36914"
+      },
+      "file_filesize": " (10.63MB)",
+      "file_order": 0,
+      "file_is_remote": 0,
+      "file_num_download": 0,
+      "download_url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.zip",
+      "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.zip",
+      "file_rawsize": 11150278
+    },
+    {
+      "file_id": 694202,
+      "file_upload": 69420,
+      "file_name": "testuser_-_Test_Music.flac",
+      "file_nicname": "flac",
+      "file_format_info": {
+        "media-type": "audio",
+        "format-name": "audio-flac-flac",
+        "default-ext": "flac",
+        "mime_type": "audio/x-flac",
+        "sr": "44k",
+        "ch": "stereo",
+        "ps": "1:05:46",
+        "br": "VBR"
+      },
+      "file_extra": {
+        "ccud": "sample,media",
+        "type": "samples",
+        "sha1": "65A8E27D8879283831B664BD8B7F0AD4"
+      },
+      "file_filesize": " (106.34MB)",
+      "file_order": 0,
+      "file_is_remote": 0,
+      "file_num_download": 0,
+      "download_url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.flac",
+      "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.flac",
+      "file_rawsize": 111502780
+    },
+    {
+      "file_id": 694203,
+      "file_upload": 69420,
+      "file_name": "testuser_-_Test_Music.mp3",
+      "file_nicname": "mp3",
+      "file_format_info": {
+        "media-type": "audio",
+        "format-name": "audio-mp3-mp3",
+        "default-ext": "mp3",
+        "mime_type": "audio/mpeg",
+        "sr": "44.1k",
+        "ch": "mono",
+        "ps": "5:46",
+        "br": "VBR"
+      },
+      "file_extra": {
+        "sha1": "C2B9A98D0CA61853B0ED8ED89DC6D9C3"
+      },
+      "file_filesize": " (10.63MB)",
+      "file_order": 0,
+      "file_is_remote": 0,
+      "file_num_download": 0,
+      "download_url": "https://ccmixter.org/content/testuser/testuser_-_Test_Music.mp3",
+      "local_path": "/var/www/ccmixter/content/testuser/testuser_-_Test_Music.mp3",
+      "file_rawsize": 11150278
+    }
+  ],
+  "upload_description_plain": "Thanks to:\r\nOther Test User – sax\r\nAnother Test User – bagpipes and sitar",
+  "upload_description_html": "Thanks to:<br />\r\nOther Test User – sax<br />\r\nAnother Test User – bagpipes and sitar"
+}
diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py b/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py
new file mode 100644
index 00000000000..7c54a5c46c5
--- /dev/null
+++ b/catalog/tests/dags/providers/provider_api_scripts/test_cc_mixter.py
@@ -0,0 +1,111 @@
+"""
+Run these tests locally with `just test -k cc_mixter`
+"""
+
+import json
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from common.licenses import LicenseInfo
+from providers.provider_api_scripts.cc_mixter import CcMixterDataIngester
+
+
+RESOURCES = Path(__file__).parent / "resources/cc_mixter"
+
+# Set up test class
+ingester = CcMixterDataIngester()
+
+
+def test_custom_requester_parses_bad_json():
+    response = Mock(text='{"value": 0123}')
+
+    with pytest.raises(json.decoder.JSONDecodeError):
+        json.loads(response.text)
+
+    assert ingester.delayed_requester._get_json(response) == {"value": 123}
+
+
+@pytest.mark.parametrize(
+    "time_string, expected_ms",
+    [
+        ("2", 2000),  # one segment
+        ("1:2", 62000),  # two segments
+        ("1:2:3", 3723000),  # three segments
+        ("01:02:03", 3723000),  # leading zeros
+    ],
+)
+def test_durations_converted_to_ms(time_string, expected_ms):
+    actual_ms = ingester._get_duration(time_string)
+    assert actual_ms == expected_ms
+
+
+@pytest.mark.parametrize(
+    "input_ext, output_ext",
+    [
+        ([], (None, None)),  # no files
+        (["zip"], (None, None)),  # ZIP is not audio so no audio files
+        (["flac", "mp3"], ("mp3", ["flac"])),  # MP3 is smaller than FLAC
+        (["zip", "flac"], ("flac", [])),  # ZIP is not audio but FLAC is
+    ],
+)
+def test_audio_files(
+    input_ext: list[str], output_ext: tuple[str, list[str]] | tuple[None, None]
+):
+    with (RESOURCES / "single_item.json").open() as f:
+        single_item = json.load(f)
+
+    input_files = [
+        file
+        for file in single_item["files"]
+        if file["file_format_info"]["default-ext"] in input_ext
+    ]
+    input_files.sort(
+        key=lambda file: input_ext.index(file["file_format_info"]["default-ext"])
+    )
+
+    output_files = ingester._get_audio_files(input_files)
+    if output_ext == (None, None):
+        assert output_files == output_ext
+    else:
+        assert output_files[0]["filetype"] == output_ext[0]
+        assert [file["filetype"] for file in output_files[1]] == output_ext[1]
+
+
+def test_get_next_query_params_provides_parameters():
+    prev_params = None
+    for idx in range(3):
+        next_params = ingester.get_next_query_params(prev_params)
+        expected_next_params = {"format": "json", "limit": 100, "offset": idx * 100}
+        assert next_params == expected_next_params
+        prev_params = next_params
+
+
+@pytest.mark.parametrize(
+    "results_len, should_continue",
+    [
+        (101, True),
+        (100, True),
+        (99, False),
+    ],
+)
+def test_determines_when_to_continue(results_len: int, should_continue: bool):
+    actual_result = ingester.get_should_continue([{}] * results_len)
+    assert actual_result == should_continue
+
+
+def test_get_record_data():
+    # Sample code for loading in the sample json
+    with (RESOURCES / "single_item.json").open() as f:
+        single_item = json.load(f)
+
+    single_record = ingester.get_record_data(single_item)
+
+    with (RESOURCES / "expected_single_record.json").open() as f:
+        expected_single_record = json.load(f)
+    expected_single_record["license_info"] = LicenseInfo(
+        **expected_single_record["license_info"]
+    )
+
+    assert single_record == expected_single_record
diff --git a/docker/upstream_db/0007_openledger_audio_view.sql b/docker/upstream_db/0007_openledger_audio_view.sql
index af625f4c261..da072fa0379 100644
--- a/docker/upstream_db/0007_openledger_audio_view.sql
+++ b/docker/upstream_db/0007_openledger_audio_view.sql
@@ -12,7 +12,8 @@ INSERT INTO public.audio_popularity_metrics (
 ) VALUES
   ('wikimedia_audio', 'global_usage_count', 0.85),
   ('jamendo', 'listens', 0.85),
-  ('freesound', 'num_downloads', 0.85);
+  ('freesound', 'num_downloads', 0.85),
+  ('ccmixter', 'upload_num_scores', 0.85);
 
 
 CREATE FUNCTION audio_popularity_percentile(
diff --git a/documentation/catalog/reference/DAGs.md b/documentation/catalog/reference/DAGs.md
index bf4f550c8ca..ad06a123255 100644
--- a/documentation/catalog/reference/DAGs.md
+++ b/documentation/catalog/reference/DAGs.md
@@ -86,6 +86,7 @@ The following are DAGs grouped by their primary tag:
 | DAG ID                                                          | Schedule Interval | Dated   | Media Type(s) |
 | --------------------------------------------------------------- | ----------------- | ------- | ------------- |
 | `brooklyn_museum_workflow`                                      | `@monthly`        | `False` | image         |
+| [`cc_mixter_workflow`](#cc_mixter_workflow)                     | `@monthly`        | `False` | audio         |
 | `cleveland_museum_workflow`                                     | `@monthly`        | `False` | image         |
 | [`europeana_workflow`](#europeana_workflow)                     | `@daily`          | `True`  | image         |
 | [`finnish_museums_workflow`](#finnish_museums_workflow)         | `@daily`          | `True`  | image         |
@@ -125,6 +126,7 @@ The following is documentation associated with each DAG (where available):
 1.  [`audio_data_refresh`](#audio_data_refresh)
 1.  [`audio_popularity_refresh`](#audio_popularity_refresh)
 1.  [`batched_update`](#batched_update)
+1.  [`cc_mixter_workflow`](#cc_mixter_workflow)
 1.  [`check_silenced_dags`](#check_silenced_dags)
 1.  [`create_filtered_audio_index`](#create_filtered_audio_index)
 1.  [`create_filtered_image_index`](#create_filtered_image_index)
@@ -315,6 +317,18 @@ used when the DagRun configuration needs to be changed after the table was
 already created: for example, if there was a problem with the `update_query`
 which caused DAG failures during the `update_batches` step.
 
+### `cc_mixter_workflow`
+
+Content Provider: ccMixter
+
+ETL Process: Use the API to identify all CC licensed media.
+
+Output: TSV file containing the media and the respective meta-data.
+
+Notes: Documentation: https://ccmixter.org/query-api ccMixter sends bad JSON and
+extremely huge headers, both of which need workarounds that are handled by this
+DAG.
+
 ### `check_silenced_dags`
 
 #### Silenced DAGs check