Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Cache waveform data in database #510

Merged
merged 10 commits into from
Feb 17, 2022
6 changes: 6 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ cd openverse_api
pipenv install
```

Note: The following non-Python libraries or binaries are transient dependencies which will also need to be present on your computer for the project to run and install as expected:

* `librdkafka`
* `exempi`
* `audiowaveform`

Comment on lines +49 to +54
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

3. Launch a new shell session

```
Expand Down
28 changes: 28 additions & 0 deletions api/catalog/api/migrations/0045_audiowaveformaddon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 3.2.9 on 2022-02-07 14:05

import django.contrib.postgres.fields
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
('api', '0044_singular_category'),
]

operations = [
migrations.CreateModel(
name='AudioWaveformAddOn',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created_on', models.DateTimeField(auto_now_add=True)),
('updated_on', models.DateTimeField(auto_now=True)),
('peaks', django.contrib.postgres.fields.ArrayField(base_field=models.FloatField(), help_text='The waveform peaks. A list of floats in the range of 0 -> 1 inclusively.', size=1500)),
('audio', models.OneToOneField(help_text='The foreign key of the audio.', on_delete=django.db.models.deletion.CASCADE, related_name='waveform', to='api.audio')),
],
options={
'abstract': False,
},
),
]
37 changes: 37 additions & 0 deletions api/catalog/api/models/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
AbstractDeletedMedia,
AbstractMatureMedia,
AbstractMedia,
AbstractMediaAddOn,
AbstractMediaList,
AbstractMediaReport,
)
from catalog.api.models.mixins import FileMixin, ForeignIdentifierMixin, MediaMixin
from catalog.api.utils.waveform import generate_peaks
from django.contrib.postgres.fields import ArrayField
from django.db import models
from uuslug import uuslug
Expand Down Expand Up @@ -157,6 +159,18 @@ def audio_set(self):
except AudioSet.DoesNotExist:
return None

def get_or_create_waveform(self):
if hasattr(self, "waveform"):
return self.waveform.peaks

peaks = generate_peaks(self)

self.waveform = AudioWaveformAddOn(audio=self, peaks=peaks)

self.waveform.save()

return self.waveform.peaks

class Meta(AbstractMedia.Meta):
db_table = "audio"

Expand Down Expand Up @@ -209,3 +223,26 @@ class Meta:
def save(self, *args, **kwargs):
self.slug = uuslug(self.title, instance=self)
super(AudioList, self).save(*args, **kwargs)


class AudioWaveformAddOn(AbstractMediaAddOn):
audio = models.OneToOneField(
to=Audio,
on_delete=models.CASCADE,
related_name="waveform",
help_text=("The foreign key of the audio."),
)

peaks = ArrayField(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of the strict typing as an array of floats, we could possibly use a jsonb field? That allows us to tweak waveform date if needed in the future & prevents the need for an approximation of 1500.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about using jsonb but I actually prefer the strict typing. I wonder if there are any performance characteristics we should consider too, if inserting an array is faster than jsonb over time? For a single client it's possible that up to 10 waveform requests will be made for an audio search result page which could cause a pretty large number of transactions under heavy traffic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To boost the performance, I was at some point considering compressing the JSON data of the waveform and storing it as a text field in the DB 😄. I'm not sure which of our choices is most performant, maybe @AetherUnbound has more knowledge in that area, but imo, we should aim for max performance even if the cost is absence of validation.

base_field=models.FloatField(),
# The approximate resolution of waveform generation
# results in _about_ 1000 peaks. We use 1500 to give
# sufficient wiggle room should we have any outlier
# files pop up.
# https://github.com/WordPress/openverse-api/blob/a7955c86d43bff504e8d41454f68717d79dd3a44/api/catalog/api/utils/waveform.py#L71
size=1500,
help_text=(
"The waveform peaks. A list of floats in"
" the range of 0 -> 1 inclusively."
),
)
15 changes: 15 additions & 0 deletions api/catalog/api/models/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@
OTHER = "other"


class AbstractMediaAddOn(OpenLedgerModel):
"""
Base class for "add-on" data for a particular media entry.

These models are appropriate places to store generated and infinitely
cacheable data that is not retrieved or generated at ingestion time.

For example, audio waveforms are generated by the API, but can be
cached indefinitely.
"""

class Meta:
abstract = True


class AbstractMedia(
IdentifierMixin, ForeignIdentifierMixin, MediaMixin, OpenLedgerModel
):
Expand Down
12 changes: 12 additions & 0 deletions api/catalog/api/utils/waveform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathlib
import shutil
import subprocess
from typing import List

import requests

Expand Down Expand Up @@ -128,3 +129,14 @@ def cleanup(file_name):
os.remove(file_path)
else:
log.info("File not found, nothing deleted")


def generate_peaks(audio) -> List[float]:
dhruvkb marked this conversation as resolved.
Show resolved Hide resolved
file_name = None
try:
file_name = download_audio(audio.url, audio.identifier)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is maybe something that can be done in a separate issue - download_audio could wrap a tempfile context manager. This could prevent us from having to explicitly delete the file if an exception occurs, since tempfile would take care of cleaning things up once the context is left.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AetherUnbound wouldn't the finally clause below run in the event of an exception?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does! I just mention tempfile because then we wouldn't have to do the cleanup ourselves, in every place where download_audio is called.

awf_out = generate_waveform(file_name, audio.duration)
return process_waveform_output(awf_out)
finally:
if file_name is not None:
cleanup(file_name)
16 changes: 1 addition & 15 deletions api/catalog/api/views/audio_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
)
from catalog.api.utils.exceptions import get_api_exception
from catalog.api.utils.throttle import OneThousandPerMinute
from catalog.api.utils.waveform import (
cleanup,
download_audio,
generate_waveform,
process_waveform_output,
)
from catalog.api.views.media_views import MediaViewSet
from django.utils.decorators import method_decorator
from drf_yasg.utils import swagger_auto_schema
Expand Down Expand Up @@ -84,21 +78,13 @@ def thumbnail(self, request, *_, **__):
def waveform(self, *_, **__):
audio = self.get_object()

file_name = None
try:
file_name = download_audio(audio.url, audio.identifier)
awf_out = generate_waveform(file_name, audio.duration)
data = process_waveform_output(awf_out)

obj = {"points": data}
obj = {"points": audio.get_or_create_waveform()}
serializer = self.get_serializer(obj)

return Response(status=200, data=serializer.data)
except Exception as e:
raise get_api_exception(getattr(e, "message", str(e)))
finally:
if file_name is not None:
cleanup(file_name)

@action(detail=True, methods=["post"], serializer_class=AudioReportSerializer)
def report(self, *args, **kwargs):
Expand Down
Empty file added api/test/unit/__init__.py
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions api/test/unit/models/audio_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import uuid
from unittest import mock

import pytest
from catalog.api.models.audio import Audio, AudioWaveformAddOn


@pytest.fixture
@pytest.mark.django_db
def audio_fixture():
audio = Audio(
identifier=uuid.uuid4(),
)

audio.save()

return audio


@pytest.mark.django_db
@mock.patch("catalog.api.models.audio.generate_peaks")
def test_audio_waveform_caches(generate_peaks_mock, audio_fixture):
mock_peaks = [0.4, 0.3, 0.1, 0, 1, 0.6]
generate_peaks_mock.return_value = mock_peaks

assert not hasattr(audio_fixture, "waveform")
assert audio_fixture.get_or_create_waveform() == mock_peaks
assert hasattr(audio_fixture, "waveform")
assert audio_fixture.waveform.peaks == mock_peaks
assert audio_fixture.get_or_create_waveform() == mock_peaks
# Should only be called once if Audio.get_or_create_waveform is using the DB value on subsequent calls
generate_peaks_mock.assert_called_once()

# Ensure the waveform addon was saved
waveform = AudioWaveformAddOn.objects.get(audio=audio_fixture)
assert waveform.peaks == mock_peaks
8 changes: 4 additions & 4 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -165,12 +165,12 @@ _api-install:
"Waiting for the API to be healthy..."

# Run API tests inside Docker
api-test args="": up wait-for-es wait-for-ing wait-for-web
docker-compose exec {{ args }} web ./test/run_test.sh
api-test docker_args="" tests="": up wait-for-es wait-for-ing wait-for-web
docker-compose exec {{ docker_args }} web ./test/run_test.sh {{ tests }}

# Run API tests locally
api-testlocal:
cd api && pipenv run ./test/run_test.sh
api-testlocal args="":
cd api && pipenv run ./test/run_test.sh {{ args }}

# Run Django administrative commands
dj args="":
Expand Down