From b094f4f3cb8eec02b1dc2f14aa1b392e57b1bc4c Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Tue, 11 Oct 2022 12:07:31 -0700 Subject: [PATCH] Simplify URL acquisition & tests --- .../provider_api_scripts/freesound.py | 39 ++++---- .../provider_api_scripts/test_freesound.py | 96 ++++++++----------- 2 files changed, 59 insertions(+), 76 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/freesound.py b/openverse_catalog/dags/providers/provider_api_scripts/freesound.py index bab05eebe..058306996 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/freesound.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/freesound.py @@ -34,6 +34,7 @@ class FreesoundDataIngester(ProviderDataIngester): endpoint = f"https://{host}/apiv2/search/text" providers = {"audio": prov.FREESOUND_DEFAULT_PROVIDER} flaky_exceptions = (SSLError, ConnectionError) + preferred_preview = "preview-hq-mp3" preview_bitrates = { "preview-hq-mp3": 128000, "preview-lq-mp3": 64000, @@ -155,14 +156,6 @@ def _get_audio_set_info(self, media_data): else: return None, None, None - @staticmethod - def _get_preview_filedata(preview_type, preview_url): - return { - "url": preview_url, - "filetype": preview_type.split("-")[-1], - "bit_rate": FreesoundDataIngester.preview_bitrates[preview_type], - } - @retry(flaky_exceptions, tries=3, delay=1, backoff=2) def _get_audio_file_size(self, url): """ @@ -180,8 +173,25 @@ def _get_audio_file_size(self, url): """ return int(requests.head(url).headers["content-length"]) - def _get_audio_files(self, media_data): - # This is the original file, needs auth for downloading. + def _get_audio_files( + self, media_data + ) -> tuple[dict, list[dict]] | tuple[None, None]: + previews = media_data.get("previews") + # If there are no previews, then we will not be able to play the file + if not previews: + return None, None + + # If our preferred preview type is not present, skip this audio + if not (preview_url := previews.get(self.preferred_preview)): + return None, None + + main_file = { + "audio_url": preview_url, + "filetype": self.preferred_preview.split("-")[-1], + "bit_rate": FreesoundDataIngester.preview_bitrates[self.preferred_preview], + "filesize": self._get_audio_file_size(preview_url), + } + # These are the original files, needs auth for downloading. # bit_rate in kilobytes, converted to bytes alt_files = [ { @@ -192,15 +202,6 @@ def _get_audio_files(self, media_data): "filesize": media_data.get("filesize"), } ] - previews = media_data.get("previews") - # If there are no previews, then we will not be able to play the file - if not previews: - return None - main_file = self._get_preview_filedata( - "preview-hq-mp3", previews["preview-hq-mp3"] - ) - main_file["audio_url"] = main_file.pop("url") - main_file["filesize"] = self._get_audio_file_size(main_file["audio_url"]) return main_file, alt_files def get_record_data(self, media_data: dict) -> dict | list[dict] | None: diff --git a/tests/dags/providers/provider_api_scripts/test_freesound.py b/tests/dags/providers/provider_api_scripts/test_freesound.py index 34c52310e..0ac1b0db0 100644 --- a/tests/dags/providers/provider_api_scripts/test_freesound.py +++ b/tests/dags/providers/provider_api_scripts/test_freesound.py @@ -29,8 +29,8 @@ def file_size_patch(): @pytest.fixture def audio_data(): - AUDIO_DATA_EXAMPLE = RESOURCES / "audio_data_example.json" - with open(AUDIO_DATA_EXAMPLE) as f: + audio_data_example = RESOURCES / "audio_data_example.json" + with open(audio_data_example) as f: yield json.load(f) @@ -83,71 +83,53 @@ def test_get_items(file_size_patch): assert actual_audio_count == expected_audio_count -def test_process_item_batch_handles_example_batch(audio_data, file_size_patch): - items_batch = [audio_data] - with patch.object( - fsd.media_stores["audio"], "add_item", return_value=1 - ) as mock_add: - fsd.process_batch(items_batch) - mock_add.assert_called_once() - _, actual_call_args = mock_add.call_args_list[0] - expected_call_args = { - "alt_files": [ - { - "bit_rate": 1381000, - "filesize": 107592, - "filetype": "wav", - "sample_rate": 44100, - "url": "https://freesound.org/apiv2/sounds/415362/download/", - } - ], - "audio_set": "https://freesound.org/apiv2/packs/23434/", +def test_get_audio_files_handles_example_audio_data(audio_data, file_size_patch): + actual = fsd._get_audio_files(audio_data) + expected = ( + { "audio_url": "https://freesound.org/data/previews/415/415362_6044691-hq.mp3", "bit_rate": 128000, - "creator": "owly-bee", - "creator_url": "https://freesound.org/people/owly-bee/", - "duration": 608, "filesize": AUDIO_FILE_SIZE, "filetype": "mp3", - "foreign_identifier": 415362, - "foreign_landing_url": "https://freesound.org/people/owly-bee/sounds/415362/", - "license_info": LicenseInfo( - license="by", - version="3.0", - url="https://creativecommons.org/licenses/by/3.0/", - raw_url="http://creativecommons.org/licenses/by/3.0/", - ), - "meta_data": { - "description": "A disinterested noise in a somewhat low tone.", - "download": "https://freesound.org/apiv2/sounds/415362/download/", - "num_downloads": 164, - }, - "raw_tags": ["eh", "disinterest", "low", "uh", "voice", "uncaring"], - "set_foreign_id": "foo", - "set_url": "https://freesound.org/apiv2/packs/23434/", - "title": "Ehh disinterested.wav", - } - assert actual_call_args == expected_call_args + }, + [ + { + "bit_rate": 1381000, + "filesize": 107592, + "filetype": "wav", + "sample_rate": 44100, + "url": "https://freesound.org/apiv2/sounds/415362/download/", + } + ], + ) + assert actual == expected -def test_extract_audio_data_returns_none_when_no_foreign_id(audio_data): - audio_data.pop("id", None) - actual_audio_info = fsd.get_record_data(audio_data) - expected_audio_info = None - assert actual_audio_info is expected_audio_info +def test_get_audio_files_returns_none_when_missing_previews(audio_data): + audio_data.pop("previews", None) + actual = fsd._get_audio_files(audio_data) + assert actual == (None, None) -def test_extract_audio_data_returns_none_when_no_audio_url(audio_data): - audio_data.pop("url", None) - audio_data.pop("download", None) - actual_audio_info = fsd.get_record_data(audio_data) - assert actual_audio_info is None +def test_get_audio_files_returns_none_when_missing_preferred_preview(audio_data): + audio_data["previews"].pop(fsd.preferred_preview) + actual = fsd._get_audio_files(audio_data) + assert actual == (None, None) -def test_extract_audio_data_returns_none_when_no_license(audio_data): - audio_data.pop("license", None) - actual_audio_info = fsd.get_record_data(audio_data) - assert actual_audio_info is None +@pytest.mark.parametrize( + "missing_fields", + [ + ("id",), + ("url", "download"), + ("license",), + ], +) +def test_get_record_data_returns_none_when_missing_data(missing_fields, audio_data): + for field in missing_fields: + audio_data.pop(field, None) + actual = fsd.get_record_data(audio_data) + assert actual is None def test_get_audio_set_info(audio_data):