Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Commit

Permalink
Simplify URL acquisition & tests
Browse files Browse the repository at this point in the history
  • Loading branch information
AetherUnbound committed Oct 11, 2022
1 parent 63fa109 commit b094f4f
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 76 deletions.
39 changes: 20 additions & 19 deletions openverse_catalog/dags/providers/provider_api_scripts/freesound.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class FreesoundDataIngester(ProviderDataIngester):
endpoint = f"https://{host}/apiv2/search/text"
providers = {"audio": prov.FREESOUND_DEFAULT_PROVIDER}
flaky_exceptions = (SSLError, ConnectionError)
preferred_preview = "preview-hq-mp3"
preview_bitrates = {
"preview-hq-mp3": 128000,
"preview-lq-mp3": 64000,
Expand Down Expand Up @@ -155,14 +156,6 @@ def _get_audio_set_info(self, media_data):
else:
return None, None, None

@staticmethod
def _get_preview_filedata(preview_type, preview_url):
return {
"url": preview_url,
"filetype": preview_type.split("-")[-1],
"bit_rate": FreesoundDataIngester.preview_bitrates[preview_type],
}

@retry(flaky_exceptions, tries=3, delay=1, backoff=2)
def _get_audio_file_size(self, url):
"""
Expand All @@ -180,8 +173,25 @@ def _get_audio_file_size(self, url):
"""
return int(requests.head(url).headers["content-length"])

def _get_audio_files(self, media_data):
# This is the original file, needs auth for downloading.
def _get_audio_files(
self, media_data
) -> tuple[dict, list[dict]] | tuple[None, None]:
previews = media_data.get("previews")
# If there are no previews, then we will not be able to play the file
if not previews:
return None, None

# If our preferred preview type is not present, skip this audio
if not (preview_url := previews.get(self.preferred_preview)):
return None, None

main_file = {
"audio_url": preview_url,
"filetype": self.preferred_preview.split("-")[-1],
"bit_rate": FreesoundDataIngester.preview_bitrates[self.preferred_preview],
"filesize": self._get_audio_file_size(preview_url),
}
# These are the original files, needs auth for downloading.
# bit_rate in kilobytes, converted to bytes
alt_files = [
{
Expand All @@ -192,15 +202,6 @@ def _get_audio_files(self, media_data):
"filesize": media_data.get("filesize"),
}
]
previews = media_data.get("previews")
# If there are no previews, then we will not be able to play the file
if not previews:
return None
main_file = self._get_preview_filedata(
"preview-hq-mp3", previews["preview-hq-mp3"]
)
main_file["audio_url"] = main_file.pop("url")
main_file["filesize"] = self._get_audio_file_size(main_file["audio_url"])
return main_file, alt_files

def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
Expand Down
96 changes: 39 additions & 57 deletions tests/dags/providers/provider_api_scripts/test_freesound.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def file_size_patch():

@pytest.fixture
def audio_data():
AUDIO_DATA_EXAMPLE = RESOURCES / "audio_data_example.json"
with open(AUDIO_DATA_EXAMPLE) as f:
audio_data_example = RESOURCES / "audio_data_example.json"
with open(audio_data_example) as f:
yield json.load(f)


Expand Down Expand Up @@ -83,71 +83,53 @@ def test_get_items(file_size_patch):
assert actual_audio_count == expected_audio_count


def test_process_item_batch_handles_example_batch(audio_data, file_size_patch):
items_batch = [audio_data]
with patch.object(
fsd.media_stores["audio"], "add_item", return_value=1
) as mock_add:
fsd.process_batch(items_batch)
mock_add.assert_called_once()
_, actual_call_args = mock_add.call_args_list[0]
expected_call_args = {
"alt_files": [
{
"bit_rate": 1381000,
"filesize": 107592,
"filetype": "wav",
"sample_rate": 44100,
"url": "https://freesound.org/apiv2/sounds/415362/download/",
}
],
"audio_set": "https://freesound.org/apiv2/packs/23434/",
def test_get_audio_files_handles_example_audio_data(audio_data, file_size_patch):
actual = fsd._get_audio_files(audio_data)
expected = (
{
"audio_url": "https://freesound.org/data/previews/415/415362_6044691-hq.mp3",
"bit_rate": 128000,
"creator": "owly-bee",
"creator_url": "https://freesound.org/people/owly-bee/",
"duration": 608,
"filesize": AUDIO_FILE_SIZE,
"filetype": "mp3",
"foreign_identifier": 415362,
"foreign_landing_url": "https://freesound.org/people/owly-bee/sounds/415362/",
"license_info": LicenseInfo(
license="by",
version="3.0",
url="https://creativecommons.org/licenses/by/3.0/",
raw_url="http://creativecommons.org/licenses/by/3.0/",
),
"meta_data": {
"description": "A disinterested noise in a somewhat low tone.",
"download": "https://freesound.org/apiv2/sounds/415362/download/",
"num_downloads": 164,
},
"raw_tags": ["eh", "disinterest", "low", "uh", "voice", "uncaring"],
"set_foreign_id": "foo",
"set_url": "https://freesound.org/apiv2/packs/23434/",
"title": "Ehh disinterested.wav",
}
assert actual_call_args == expected_call_args
},
[
{
"bit_rate": 1381000,
"filesize": 107592,
"filetype": "wav",
"sample_rate": 44100,
"url": "https://freesound.org/apiv2/sounds/415362/download/",
}
],
)
assert actual == expected


def test_extract_audio_data_returns_none_when_no_foreign_id(audio_data):
audio_data.pop("id", None)
actual_audio_info = fsd.get_record_data(audio_data)
expected_audio_info = None
assert actual_audio_info is expected_audio_info
def test_get_audio_files_returns_none_when_missing_previews(audio_data):
audio_data.pop("previews", None)
actual = fsd._get_audio_files(audio_data)
assert actual == (None, None)


def test_extract_audio_data_returns_none_when_no_audio_url(audio_data):
audio_data.pop("url", None)
audio_data.pop("download", None)
actual_audio_info = fsd.get_record_data(audio_data)
assert actual_audio_info is None
def test_get_audio_files_returns_none_when_missing_preferred_preview(audio_data):
audio_data["previews"].pop(fsd.preferred_preview)
actual = fsd._get_audio_files(audio_data)
assert actual == (None, None)


def test_extract_audio_data_returns_none_when_no_license(audio_data):
audio_data.pop("license", None)
actual_audio_info = fsd.get_record_data(audio_data)
assert actual_audio_info is None
@pytest.mark.parametrize(
"missing_fields",
[
("id",),
("url", "download"),
("license",),
],
)
def test_get_record_data_returns_none_when_missing_data(missing_fields, audio_data):
for field in missing_fields:
audio_data.pop(field, None)
actual = fsd.get_record_data(audio_data)
assert actual is None


def test_get_audio_set_info(audio_data):
Expand Down

0 comments on commit b094f4f

Please sign in to comment.