Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Cache waveform data in database #510

Merged
merged 10 commits into from
Feb 17, 2022
6 changes: 6 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ cd openverse_api
pipenv install
```

Note: The following non-Python libraries or binaries are transient dependencies which will also need to be present on your computer for the project to run and install as expected:

* `librdkafka`
* `exempi`
* `audiowaveform`

Comment on lines +49 to +54
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

3. Launch a new shell session

```
Expand Down
26 changes: 26 additions & 0 deletions api/catalog/api/migrations/0045_audioaddon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 3.2.12 on 2022-02-16 19:28

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('api', '0044_singular_category'),
]

operations = [
migrations.CreateModel(
name='AudioAddOn',
fields=[
('created_on', models.DateTimeField(auto_now_add=True)),
('updated_on', models.DateTimeField(auto_now=True)),
('audio_identifier', models.UUIDField(help_text='The identifier of the audio object.', primary_key=True, serialize=False)),
('waveform_peaks', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), help_text='The waveform peaks. A list of floats in the range of 0 -> 1 inclusively.', null=True, size=1500)),
],
options={
'abstract': False,
},
),
]
46 changes: 46 additions & 0 deletions api/catalog/api/models/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
AbstractMediaReport,
)
from catalog.api.models.mixins import FileMixin, ForeignIdentifierMixin, MediaMixin
from catalog.api.utils.waveform import generate_peaks
from django.contrib.postgres.fields import ArrayField
from django.db import models
from uuslug import uuslug
Expand Down Expand Up @@ -96,6 +97,40 @@ class Meta:
abstract = True


class AudioAddOn(OpenLedgerModel):
audio_identifier = models.UUIDField(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is the primary key of this model we should drop the audio_ prefix.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should also note that leaving it as it is wouldn't be wrong either, as it refers to the Audio model instance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think we should keep it as it's not the identifier for the add-on exactly... it just happens to act as that and we're throwing away the actual canonical PK for the add on table because we never use it anyway.

primary_key=True,
help_text=("The identifier of the audio object."),
)
"""
This cannot be a "ForeignKey" or "OneToOneRel" because the refresh process
wipes out the Audio table completely and recreates it. If we made these a FK
or OneToOneRel there'd be foreign key constraint added that would be violated
when the Audio table is recreated.

The index is necessary as this column is used by the Audio object to query
for the relevant add on.

The refresh process will also eventually include cleaning up any potentially
dangling audio_add_on rows.
Comment on lines +106 to +115
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great piece of documentation!

"""

waveform_peaks = ArrayField(
base_field=models.FloatField(),
# The approximate resolution of waveform generation
# results in _about_ 1000 peaks. We use 1500 to give
# sufficient wiggle room should we have any outlier
# files pop up.
# https://github.com/WordPress/openverse-api/blob/a7955c86d43bff504e8d41454f68717d79dd3a44/api/catalog/api/utils/waveform.py#L71
size=1500,
help_text=(
"The waveform peaks. A list of floats in"
" the range of 0 -> 1 inclusively."
),
null=True,
)


class Audio(AudioFileMixin, AbstractMedia):
"""
Inherited fields
Expand Down Expand Up @@ -157,6 +192,17 @@ def audio_set(self):
except AudioSet.DoesNotExist:
return None

def get_or_create_waveform(self):
add_on, _ = AudioAddOn.objects.get_or_create(audio_identifier=self.identifier)

if add_on.waveform_peaks is not None:
return add_on.waveform_peaks

add_on.waveform_peaks = generate_peaks(self)
add_on.save()

return add_on.waveform_peaks

class Meta(AbstractMedia.Meta):
db_table = "audio"

Expand Down
12 changes: 12 additions & 0 deletions api/catalog/api/utils/waveform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathlib
import shutil
import subprocess
from typing import List

import requests

Expand Down Expand Up @@ -128,3 +129,14 @@ def cleanup(file_name):
os.remove(file_path)
else:
log.info("File not found, nothing deleted")


def generate_peaks(audio) -> List[float]:
dhruvkb marked this conversation as resolved.
Show resolved Hide resolved
file_name = None
try:
file_name = download_audio(audio.url, audio.identifier)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is maybe something that can be done in a separate issue - download_audio could wrap a tempfile context manager. This could prevent us from having to explicitly delete the file if an exception occurs, since tempfile would take care of cleaning things up once the context is left.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AetherUnbound wouldn't the finally clause below run in the event of an exception?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does! I just mention tempfile because then we wouldn't have to do the cleanup ourselves, in every place where download_audio is called.

awf_out = generate_waveform(file_name, audio.duration)
return process_waveform_output(awf_out)
finally:
if file_name is not None:
cleanup(file_name)
16 changes: 1 addition & 15 deletions api/catalog/api/views/audio_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
)
from catalog.api.utils.exceptions import get_api_exception
from catalog.api.utils.throttle import OneThousandPerMinute
from catalog.api.utils.waveform import (
cleanup,
download_audio,
generate_waveform,
process_waveform_output,
)
from catalog.api.views.media_views import MediaViewSet
from django.utils.decorators import method_decorator
from drf_yasg.utils import swagger_auto_schema
Expand Down Expand Up @@ -84,21 +78,13 @@ def thumbnail(self, request, *_, **__):
def waveform(self, *_, **__):
audio = self.get_object()

file_name = None
try:
file_name = download_audio(audio.url, audio.identifier)
awf_out = generate_waveform(file_name, audio.duration)
data = process_waveform_output(awf_out)

obj = {"points": data}
obj = {"points": audio.get_or_create_waveform()}
serializer = self.get_serializer(obj)

return Response(status=200, data=serializer.data)
except Exception as e:
raise get_api_exception(getattr(e, "message", str(e)))
finally:
if file_name is not None:
cleanup(file_name)

@action(detail=True, methods=["post"], serializer_class=AudioReportSerializer)
def report(self, *args, **kwargs):
Expand Down
Empty file added api/test/unit/__init__.py
Empty file.
Empty file.
41 changes: 41 additions & 0 deletions api/test/unit/models/audio_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import uuid
from unittest import mock

import pytest
from catalog.api.models.audio import Audio, AudioAddOn


@pytest.fixture
@pytest.mark.django_db
def audio_fixture():
audio = Audio(
identifier=uuid.uuid4(),
)

audio.save()

return audio


@pytest.mark.django_db
@mock.patch("catalog.api.models.audio.generate_peaks")
def test_audio_waveform_caches(generate_peaks_mock, audio_fixture):
mock_waveform = [0.4, 0.3, 0.1, 0, 1, 0.6]
generate_peaks_mock.return_value = mock_waveform

assert AudioAddOn.objects.count() == 0
assert audio_fixture.get_or_create_waveform() == mock_waveform
assert AudioAddOn.objects.count() == 1
# Ensure the waveform was saved
assert (
AudioAddOn.objects.get(audio_identifier=audio_fixture.identifier).waveform_peaks
== mock_waveform
)
assert audio_fixture.get_or_create_waveform() == mock_waveform
# Should only be called once if Audio.get_or_create_waveform is using the DB value on subsequent calls
generate_peaks_mock.assert_called_once()

# Ensure there are no foreign constraints on the AudioAddOn that would cause failures during refresh
audio_fixture.delete()

assert AudioAddOn.objects.count() == 1
12 changes: 6 additions & 6 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ recreate:
@just up "--force-recreate --build"

# Show logs of all, or named, Docker services
logs services="":
docker-compose {{ DOCKER_FILE }} logs -f {{ services }}
logs services="" args="-f":
sarayourfriend marked this conversation as resolved.
Show resolved Hide resolved
docker-compose {{ DOCKER_FILE }} logs {{ args }} {{ services }}


########
Expand Down Expand Up @@ -165,12 +165,12 @@ _api-install:
"Waiting for the API to be healthy..."

# Run API tests inside Docker
api-test args="": up wait-for-es wait-for-ing wait-for-web
docker-compose exec {{ args }} web ./test/run_test.sh
api-test docker_args="" tests="": up wait-for-es wait-for-ing wait-for-web
docker-compose exec {{ docker_args }} web ./test/run_test.sh {{ tests }}

# Run API tests locally
api-testlocal:
cd api && pipenv run ./test/run_test.sh
api-testlocal args="":
cd api && pipenv run ./test/run_test.sh {{ args }}

# Run Django administrative commands
dj args="":
Expand Down