This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Add Audio to the database #111

Merged
26 commits merged on Jul 2, 2021
Commits (26)
273ea6a
Extract common media functionality from ImageStore to abstract MediaS…
obulat Jun 3, 2021
6b7ba33
Add MediaStore entity
obulat Jun 21, 2021
f8d66fe
Add audio fields
obulat Jun 21, 2021
f270112
Add a MediaStorage method to clean common metadata
obulat Jun 21, 2021
33ed416
Fix typo and tests
obulat Jun 21, 2021
f34c316
Merge branch 'main' into extract_media_storage
obulat Jun 23, 2021
f0a0bcf
Clean common media metadata in the MediaStorage class
obulat Jun 23, 2021
dec95c3
Fix _IMAGE_TSV_COLUMNS renaming to IMAGE_TSV_COLUMNS
obulat Jun 23, 2021
2302cfd
Fix pep8 violation
obulat Jun 23, 2021
3a91cee
Refactor source parsing
obulat Jun 23, 2021
c8b3637
Linting fixes
obulat Jun 23, 2021
04a0dff
Clean up common metadata validation
obulat Jun 23, 2021
2c731c6
Merge branch 'extract_media_storage' into add_audio_storage
obulat Jun 23, 2021
738df7d
Fix audio column types, sync with MediaStorage
obulat Jun 23, 2021
b2bd55f
Extract media type from stage tsv file name for loader
obulat Jun 24, 2021
b5471c9
Prepare `paths` and `sql` functions for other media types
obulat Jun 24, 2021
5b09923
Merge branch 'add_audio_storage' into add_audio_db
obulat Jun 24, 2021
baafeb4
Add audio-specific database loading functionality
obulat Jun 25, 2021
62a1793
Add audio db creation sql files
obulat Jun 25, 2021
96b0f35
Fix linting errors
obulat Jun 25, 2021
81e5410
Merge branch 'main' into add_audio_db
obulat Jun 25, 2021
1cced03
Remove typo from merge
obulat Jun 25, 2021
37caaf9
Fix audio columns order
obulat Jun 27, 2021
e46dd39
Correct column order in audio tests
obulat Jun 29, 2021
d13fff9
Make watermarked common media column
obulat Jun 30, 2021
17f4f0a
Merge branch 'main' into add_audio_db
obulat Jul 2, 2021
25 changes: 15 additions & 10 deletions src/cc_catalog_airflow/dags/common/storage/audio.py
@@ -47,6 +47,18 @@
columns.JSONColumn(
name='tags', required=False
),
columns.BooleanColumn(
name='watermarked', required=False,
),
columns.StringColumn(
name='provider', required=False, size=80, truncate=False
),
columns.StringColumn(
name='source', required=False, size=80, truncate=False
),
columns.StringColumn(
name="ingestion_type", required=False, size=80, truncate=False
),
columns.IntegerColumn(
name='duration', required=False
),
@@ -70,15 +82,6 @@
# Alternative files: url, filesize, bit_rate, sample_rate
name='alt_audio_files', required=False
),
columns.StringColumn(
name='provider', required=False, size=80, truncate=False
),
columns.StringColumn(
name='source', required=False, size=80, truncate=False
),
columns.StringColumn(
name="ingestion_type", required=False, size=80, truncate=False
),
]

Audio = namedtuple("Audio", [c.NAME for c in AUDIO_TSV_COLUMNS])
@@ -126,6 +129,7 @@ def add_item(
title: Optional[str] = None,
meta_data: Optional[Union[Dict, str]] = None,
raw_tags: Optional[Union[list, str]] = None,
watermarked: Optional[bool] = False,
duration: Optional[int] = None,
bit_rate: Optional[int] = None,
sample_rate: Optional[int] = None,
@@ -137,7 +141,7 @@ def add_item(
set_url: Optional[str] = None,
alt_audio_files: Optional[Dict] = None,
source: Optional[str] = None,
ingestion_type: Optional[str] = 'commoncrawl',
ingestion_type: Optional[str] = None,
):
"""
Add information for a single audio to the AudioStore.
@@ -208,6 +212,7 @@ def add_item(
'title': title,
'meta_data': meta_data,
'raw_tags': raw_tags,
'watermarked': watermarked,
'duration': duration,
'bit_rate': bit_rate,
'sample_rate': sample_rate,
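A note on the reordering above: the serialized TSV row follows the order of `AUDIO_TSV_COLUMNS`, and the `Audio` namedtuple is built from the same list, so moving `watermarked`, `provider`, `source`, and `ingestion_type` ahead of the audio-specific fields changes where those values land in every row. A minimal sketch of the relationship, with simplified stand-ins for the real column classes:

from collections import namedtuple

# Illustrative stand-in for the real column classes; only NAME matters here.
Column = namedtuple('Column', ['NAME'])

AUDIO_TSV_COLUMNS = [
    Column('foreign_identifier'),
    Column('tags'),
    Column('watermarked'),      # common columns now precede audio fields
    Column('provider'),
    Column('source'),
    Column('ingestion_type'),
    Column('duration'),         # audio-specific fields follow
]

# The Audio namedtuple, and therefore each TSV row, mirrors this order.
Audio = namedtuple('Audio', [c.NAME for c in AUDIO_TSV_COLUMNS])
print(Audio._fields)
# ('foreign_identifier', 'tags', 'watermarked', 'provider', 'source',
#  'ingestion_type', 'duration')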
2 changes: 1 addition & 1 deletion src/cc_catalog_airflow/dags/common/storage/image.py
@@ -116,7 +116,7 @@ def add_item(
raw_tags=None,
watermarked: Optional[str] = "f",
source: Optional[str] = None,
ingestion_type: Optional[str] = 'commoncrawl',
ingestion_type: Optional[str] = None,
):
"""
Add information for a single image to the ImageStore.
15 changes: 13 additions & 2 deletions src/cc_catalog_airflow/dags/common/storage/media.py
@@ -30,6 +30,9 @@
"pdm",
}

COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'


class MediaStore(metaclass=abc.ABCMeta):
"""
@@ -137,8 +140,8 @@ def clean_media_metadata(self, **media_data) -> Optional[dict]:
and for common metadata we:
- remove `license_url` and `raw_license_url`,
- validate `license_` and `license_version`,
- enrich `metadata` and `tags`,
- remove `raw_tags` are removed,
- enrich `metadata`,
- replace `raw_tags` with enriched `tags`,
- validate `source`,
- add `provider`,
- add `filesize` (with value of None)
@@ -153,6 +156,14 @@ def clean_media_metadata(self, **media_data) -> Optional[dict]:
media_data.get('source'),
self._PROVIDER
)
# Add ingestion_type column value based on `source`.
# The implementation is based on `ingestion_column`
if media_data.get('ingestion_type') is None:
if media_data['source'] == 'commoncrawl':
media_data['ingestion_type'] = 'commoncrawl'
else:
media_data['ingestion_type'] = 'provider_api'

media_data['tags'] = self._enrich_tags(
media_data.pop('raw_tags', None)
)
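The new block above fills in `ingestion_type` only when the provider script did not supply one, deriving it from `source`. A minimal standalone sketch of that fallback (the function name and dict shape are illustrative):

COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'

def derive_ingestion_type(media_data: dict) -> dict:
    # Respect an explicit ingestion_type; otherwise infer it from source.
    if media_data.get('ingestion_type') is None:
        if media_data.get('source') == COMMON_CRAWL:
            media_data['ingestion_type'] = COMMON_CRAWL
        else:
            media_data['ingestion_type'] = PROVIDER_API
    return media_data

print(derive_ingestion_type({'source': 'commoncrawl'}))
# {'source': 'commoncrawl', 'ingestion_type': 'commoncrawl'}
print(derive_ingestion_type({'source': 'jamendo'}))
# {'source': 'jamendo', 'ingestion_type': 'provider_api'}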
25 changes: 15 additions & 10 deletions src/cc_catalog_airflow/dags/common/storage/test_audio.py
@@ -33,6 +33,7 @@
'creator_url': 'https://creatorurl.com',
'title': 'agreatpicture',
'meta_data': {},
'watermarked': None,
'raw_tags': {},
'bit_rate': None,
'sample_rate': None,
@@ -221,6 +222,7 @@ def default_audio_args(
title='agreatsong',
meta_data={"description": "cat song"},
tags={"name": "tag1", "provider": "testing"},
watermarked=None,
duration=100,
bit_rate=None,
sample_rate=None,
@@ -262,6 +264,10 @@ def test_create_tsv_row_creates_alt_audio_files(
'agreatsong',
'{"description": "cat song"}',
'{"name": "tag1", "provider": "testing"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'100',
'\\N',
'\\N',
@@ -271,9 +277,6 @@
'[{"url": '
'"http://alternative.com/audio.mp3", "filesize": "123", "bit_rate": "41000", '
'"sample_rate": "16000"}]',
'testing_provider',
'testing_source',
'provider_api',

]) + '\n'
assert actual_row == expected_row
@@ -308,6 +311,10 @@ def test_create_tsv_row_creates_audio_set(
'agreatsong',
'{"description": "cat song"}',
'{"name": "tag1", "provider": "testing"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'100',
'\\N',
'\\N',
@@ -316,10 +323,6 @@
'{"audio_set": "test_audio_set", "set_url": "test.com", '
'"set_position": "1", "set_thumbnail": "thumbnail.jpg"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',

]) + '\n'
assert actual_row == expected_row

@@ -438,6 +441,7 @@ def mock_validate_url(url_string):
'title': 'agreatsong',
'meta_data': {'description': 'a song about cat'},
'tags': [{'name': 'tag1', 'provider': 'testing'}],
'watermarked': None,
'bit_rate': 16000,
'sample_rate': 44100,
'category': 'music',
@@ -472,6 +476,10 @@ def mock_validate_url(url_string):
'agreatsong',
'{"description": "a song about cat"}',
'[{"name": "tag1", "provider": "testing"}]',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'200',
'16000',
'44100',
@@ -480,8 +488,5 @@
'{"audio_set": "album", "set_position": "1", "set_url": "https://album.com/", '
'"set_thumbnail": "https://album.com/thumbnail.jpg"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api'
]) + '\n'
assert expect_row == actual_row
7 changes: 7 additions & 0 deletions src/cc_catalog_airflow/dags/util/loader/column_names.py
@@ -25,3 +25,10 @@
UPDATED_ON = 'updated_on'
LAST_SYNCED = 'last_synced_with_source'
REMOVED = 'removed_from_source'
DURATION = 'duration'
BIT_RATE = 'bit_rate'
SAMPLE_RATE = 'sample_rate'
CATEGORY = 'category'
GENRE = 'genre'
AUDIO_SET = 'audio_set'
ALT_AUDIO_FILES = 'alt_audio_files'
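The new constants extend the shared column-name module so audio-specific loader SQL can reference names consistently. A hypothetical usage sketch; the grouping and the import path are assumptions, mirroring the `util.loader` package layout shown above:

from util.loader import column_names as col

# Hypothetical grouping: audio-only columns appended to the shared loader schema.
AUDIO_ONLY_COLUMNS = [
    col.DURATION, col.BIT_RATE, col.SAMPLE_RATE,
    col.CATEGORY, col.GENRE, col.AUDIO_SET, col.ALT_AUDIO_FILES,
]
print(', '.join(AUDIO_ONLY_COLUMNS))
# duration, bit_rate, sample_rate, category, genre, audio_set, alt_audio_files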
15 changes: 8 additions & 7 deletions src/cc_catalog_airflow/dags/util/loader/ingestion_column.py
@@ -6,7 +6,9 @@
import logging
import os

from common.storage.audio import AUDIO_TSV_COLUMNS
from common.storage.image import IMAGE_TSV_COLUMNS
from common.storage import media

logger = logging.getLogger(__name__)

@@ -25,12 +27,11 @@ def check_and_fix_tsv_file(tsv_file_name):
# If no media file is set in the filename, it is
# probably image
media_type = 'image'
old_cols_number = len(IMAGE_TSV_COLUMNS) - 1
if media_type == 'audio':
# TODO: when audio is added:
# old_cols_number = len(AUDIO_TSV_COLUMNS) - 1
old_cols_number = 0
new_cols_number = old_cols_number + 1
new_cols_number = len(AUDIO_TSV_COLUMNS)
else:
new_cols_number = len(IMAGE_TSV_COLUMNS)
old_cols_number = new_cols_number - 1
with open(tsv_file_name) as f:
test_line = f.readline()
line_list = [word.strip() for word in test_line.split('\t')]
@@ -51,8 +52,8 @@


def _add_ingestion_type(tsv_file_name, source):
COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'
COMMON_CRAWL = media.COMMON_CRAWL
PROVIDER_API = media.PROVIDER_API
ingestion_type = source if source == COMMON_CRAWL else PROVIDER_API
logger.debug(f'Found source: {source}')
logger.info(
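`check_and_fix_tsv_file` above patches staged TSVs written before the `ingestion_type` column existed, which are one column short of the current layout; the expected count comes from `AUDIO_TSV_COLUMNS` or `IMAGE_TSV_COLUMNS` depending on the media type parsed from the file name. A rough sketch of the detection step, with a plain integer standing in for the real column lists:

def needs_ingestion_type(tsv_file_name: str, expected_cols: int) -> bool:
    """Return True when the first row of a staged TSV predates ingestion_type."""
    with open(tsv_file_name) as f:
        first_line = f.readline()
    found_cols = len([word.strip() for word in first_line.split('\t')])
    # A legacy row is exactly one column short of the current layout;
    # audio TSVs are new, so no legacy rows are expected for them.
    return found_cols == expected_cols - 1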
8 changes: 6 additions & 2 deletions src/cc_catalog_airflow/dags/util/loader/loader.py
@@ -23,8 +23,12 @@ def load_local_data(output_dir, postgres_conn_id, identifier, overwrite=False):

def copy_to_s3(output_dir, bucket, identifier, aws_conn_id):
tsv_file_name = paths.get_staged_file(output_dir, identifier)
media_type = _extract_media_type(tsv_file_name)
ingestion_column.check_and_fix_tsv_file(tsv_file_name)
s3.copy_file_to_s3_staging(identifier, tsv_file_name, bucket, aws_conn_id)
s3.copy_file_to_s3_staging(
identifier, tsv_file_name, bucket, aws_conn_id,
media_prefix=media_type
)


def load_s3_data(
@@ -41,7 +45,7 @@ def load_s3_data(
if media_type is None:
media_type = 'image'
tsv_key = s3.get_staged_s3_object(
identifier, bucket, aws_conn_id
identifier, bucket, aws_conn_id, media_prefix=media_type
)
sql.load_s3_data_to_intermediate_table(
postgres_conn_id,
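`_extract_media_type`, referenced in `copy_to_s3` above, infers the media type from the staged TSV file name so it can be passed to S3 as `media_prefix`. A hypothetical sketch of such a helper, assuming staged file names begin with the media type (e.g. `audio_...tsv`) and defaulting to 'image' otherwise:

import os

def _extract_media_type(tsv_file_name: str) -> str:
    """Guess the media type from a staged TSV file name; default to image."""
    media_type = os.path.basename(tsv_file_name).split('_', 1)[0]
    if media_type not in ('image', 'audio'):
        # Older staged files carry no media-type prefix and hold image data.
        media_type = 'image'
    return media_type

print(_extract_media_type('/tmp/audio_jamendo_20210702.tsv'))  # audio
print(_extract_media_type('/tmp/flickr_20210702.tsv'))         # image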