This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Add Audio to the database #111

Merged
26 commits merged on Jul 2, 2021
Commits (26)
273ea6a
Extract common media functionality from ImageStore to abstract MediaS…
obulat Jun 3, 2021
6b7ba33
Add MediaStore entity
obulat Jun 21, 2021
f8d66fe
Add audio fields
obulat Jun 21, 2021
f270112
Add a MediaStorage method to clean common metadata
obulat Jun 21, 2021
33ed416
Fix typo and tests
obulat Jun 21, 2021
f34c316
Merge branch 'main' into extract_media_storage
obulat Jun 23, 2021
f0a0bcf
Clean common media metadata in the MediaStorage class
obulat Jun 23, 2021
dec95c3
Fix _IMAGE_TSV_COLUMNS renaming to IMAGE_TSV_COLUMNS
obulat Jun 23, 2021
2302cfd
Fix pep8 violation
obulat Jun 23, 2021
3a91cee
Refactor source parsing
obulat Jun 23, 2021
c8b3637
Linting fixes
obulat Jun 23, 2021
04a0dff
Clean up common metadata validation
obulat Jun 23, 2021
2c731c6
Merge branch 'extract_media_storage' into add_audio_storage
obulat Jun 23, 2021
738df7d
Fix audio column types, sync with MediaStorage
obulat Jun 23, 2021
b2bd55f
Extract media type from stage tsv file name for loader
obulat Jun 24, 2021
b5471c9
Prepare `paths` and `sql` functions for other media types
obulat Jun 24, 2021
5b09923
Merge branch 'add_audio_storage' into add_audio_db
obulat Jun 24, 2021
baafeb4
Add audio-specific database loading functionality
obulat Jun 25, 2021
62a1793
Add audio db creation sql files
obulat Jun 25, 2021
96b0f35
Fix linting errors
obulat Jun 25, 2021
81e5410
Merge branch 'main' into add_audio_db
obulat Jun 25, 2021
1cced03
Remove typo from merge
obulat Jun 25, 2021
37caaf9
Fix audio columns order
obulat Jun 27, 2021
e46dd39
Correct column order in audio tests
obulat Jun 29, 2021
d13fff9
Make watermarked common media column
obulat Jun 30, 2021
17f4f0a
Merge branch 'main' into add_audio_db
obulat Jul 2, 2021
25 changes: 15 additions & 10 deletions src/cc_catalog_airflow/dags/common/storage/audio.py
@@ -47,6 +47,18 @@
columns.JSONColumn(
name='tags', required=False
),
columns.BooleanColumn(
name='watermarked', required=False,
),
columns.StringColumn(
name='provider', required=False, size=80, truncate=False
),
columns.StringColumn(
name='source', required=False, size=80, truncate=False
),
columns.StringColumn(
name="ingestion_type", required=False, size=80, truncate=False
),
columns.IntegerColumn(
name='duration', required=False
),
@@ -70,15 +82,6 @@
# Alternative files: url, filesize, bit_rate, sample_rate
name='alt_audio_files', required=False
),
columns.StringColumn(
name='provider', required=False, size=80, truncate=False
),
columns.StringColumn(
name='source', required=False, size=80, truncate=False
),
columns.StringColumn(
name="ingestion_type", required=False, size=80, truncate=False
),
]

Audio = namedtuple("Audio", [c.NAME for c in AUDIO_TSV_COLUMNS])
@@ -126,6 +129,7 @@ def add_item(
title: Optional[str] = None,
meta_data: Optional[Union[Dict, str]] = None,
raw_tags: Optional[Union[list, str]] = None,
watermarked: Optional[bool] = False,
duration: Optional[int] = None,
bit_rate: Optional[int] = None,
sample_rate: Optional[int] = None,
@@ -137,7 +141,7 @@ def add_item(
set_url: Optional[str] = None,
alt_audio_files: Optional[Dict] = None,
source: Optional[str] = None,
ingestion_type: Optional[str] = 'commoncrawl',
ingestion_type: Optional[str] = None,
):
"""
Add information for a single audio to the AudioStore.
@@ -208,6 +212,7 @@ def add_item(
'title': title,
'meta_data': meta_data,
'raw_tags': raw_tags,
'watermarked': watermarked,
'duration': duration,
'bit_rate': bit_rate,
'sample_rate': sample_rate,
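A note on the reordering above: the serialized TSV row follows the order of `AUDIO_TSV_COLUMNS`, and the `Audio` namedtuple is built from the same list, so moving `watermarked`, `provider`, `source`, and `ingestion_type` ahead of the audio-specific fields changes where those values land in every row. A minimal sketch of the relationship, with simplified stand-ins for the real column classes:

from collections import namedtuple

# Illustrative stand-in for the real column classes; only NAME matters here.
Column = namedtuple('Column', ['NAME'])

AUDIO_TSV_COLUMNS = [
    Column('foreign_identifier'),
    Column('tags'),
    Column('watermarked'),      # common columns now precede audio fields
    Column('provider'),
    Column('source'),
    Column('ingestion_type'),
    Column('duration'),         # audio-specific fields follow
]

# The Audio namedtuple, and therefore each TSV row, mirrors this order.
Audio = namedtuple('Audio', [c.NAME for c in AUDIO_TSV_COLUMNS])
print(Audio._fields)
# ('foreign_identifier', 'tags', 'watermarked', 'provider', 'source',
#  'ingestion_type', 'duration')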
2 changes: 1 addition & 1 deletion src/cc_catalog_airflow/dags/common/storage/image.py
@@ -116,7 +116,7 @@ def add_item(
raw_tags=None,
watermarked: Optional[str] = "f",
source: Optional[str] = None,
ingestion_type: Optional[str] = 'commoncrawl',
ingestion_type: Optional[str] = None,
):
"""
Add information for a single image to the ImageStore.
15 changes: 13 additions & 2 deletions src/cc_catalog_airflow/dags/common/storage/media.py
@@ -30,6 +30,9 @@
"pdm",
}

COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'


class MediaStore(metaclass=abc.ABCMeta):
"""
@@ -137,8 +140,8 @@ def clean_media_metadata(self, **media_data) -> Optional[dict]:
and for common metadata we:
- remove `license_url` and `raw_license_url`,
- validate `license_` and `license_version`,
- enrich `metadata` and `tags`,
- remove `raw_tags` are removed,
- enrich `metadata`,
- replace `raw_tags` with enriched `tags`,
- validate `source`,
- add `provider`,
- add `filesize` (with value of None)
@@ -153,6 +156,14 @@ def clean_media_metadata(self, **media_data) -> Optional[dict]:
media_data.get('source'),
self._PROVIDER
)
# Add ingestion_type column value based on `source`.
# The implementation is based on `ingestion_column`
if media_data.get('ingestion_type') is None:
if media_data['source'] == 'commoncrawl':
media_data['ingestion_type'] = 'commoncrawl'
else:
media_data['ingestion_type'] = 'provider_api'

media_data['tags'] = self._enrich_tags(
media_data.pop('raw_tags', None)
)
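The new block above fills in `ingestion_type` only when the provider script did not supply one, deriving it from `source`. A minimal standalone sketch of that fallback (the function name and dict shape are illustrative):

COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'

def derive_ingestion_type(media_data: dict) -> dict:
    # Respect an explicit ingestion_type; otherwise infer it from source.
    if media_data.get('ingestion_type') is None:
        if media_data.get('source') == COMMON_CRAWL:
            media_data['ingestion_type'] = COMMON_CRAWL
        else:
            media_data['ingestion_type'] = PROVIDER_API
    return media_data

print(derive_ingestion_type({'source': 'commoncrawl'}))
# {'source': 'commoncrawl', 'ingestion_type': 'commoncrawl'}
print(derive_ingestion_type({'source': 'jamendo'}))
# {'source': 'jamendo', 'ingestion_type': 'provider_api'}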
25 changes: 15 additions & 10 deletions src/cc_catalog_airflow/dags/common/storage/test_audio.py
@@ -33,6 +33,7 @@
'creator_url': 'https://creatorurl.com',
'title': 'agreatpicture',
'meta_data': {},
'watermarked': None,
'raw_tags': {},
'bit_rate': None,
'sample_rate': None,
@@ -221,6 +222,7 @@ def default_audio_args(
title='agreatsong',
meta_data={"description": "cat song"},
tags={"name": "tag1", "provider": "testing"},
watermarked=None,
duration=100,
bit_rate=None,
sample_rate=None,
@@ -262,6 +264,10 @@ def test_create_tsv_row_creates_alt_audio_files(
'agreatsong',
'{"description": "cat song"}',
'{"name": "tag1", "provider": "testing"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'100',
'\\N',
'\\N',
@@ -271,9 +277,6 @@
'[{"url": '
'"http://alternative.com/audio.mp3", "filesize": "123", "bit_rate": "41000", '
'"sample_rate": "16000"}]',
'testing_provider',
'testing_source',
'provider_api',

]) + '\n'
assert actual_row == expected_row
@@ -308,6 +311,10 @@ def test_create_tsv_row_creates_audio_set(
'agreatsong',
'{"description": "cat song"}',
'{"name": "tag1", "provider": "testing"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'100',
'\\N',
'\\N',
@@ -316,10 +323,6 @@
'{"audio_set": "test_audio_set", "set_url": "test.com", '
'"set_position": "1", "set_thumbnail": "thumbnail.jpg"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api',

]) + '\n'
assert actual_row == expected_row

@@ -438,6 +441,7 @@ def mock_validate_url(url_string):
'title': 'agreatsong',
'meta_data': {'description': 'a song about cat'},
'tags': [{'name': 'tag1', 'provider': 'testing'}],
'watermarked': None,
'bit_rate': 16000,
'sample_rate': 44100,
'category': 'music',
@@ -472,6 +476,10 @@ def mock_validate_url(url_string):
'agreatsong',
'{"description": "a song about cat"}',
'[{"name": "tag1", "provider": "testing"}]',
'\\N',
'testing_provider',
'testing_source',
'provider_api',
'200',
'16000',
'44100',
@@ -480,8 +488,5 @@
'{"audio_set": "album", "set_position": "1", "set_url": "https://album.com/", '
'"set_thumbnail": "https://album.com/thumbnail.jpg"}',
'\\N',
'testing_provider',
'testing_source',
'provider_api'
]) + '\n'
assert expect_row == actual_row
7 changes: 7 additions & 0 deletions src/cc_catalog_airflow/dags/util/loader/column_names.py
@@ -25,3 +25,10 @@
UPDATED_ON = 'updated_on'
LAST_SYNCED = 'last_synced_with_source'
REMOVED = 'removed_from_source'
DURATION = 'duration'
BIT_RATE = 'bit_rate'
SAMPLE_RATE = 'sample_rate'
CATEGORY = 'category'
GENRE = 'genre'
AUDIO_SET = 'audio_set'
ALT_AUDIO_FILES = 'alt_audio_files'
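The new constants extend the shared column-name module so audio-specific loader SQL can reference names consistently. A hypothetical usage sketch; the grouping and the import path are assumptions, mirroring the `util.loader` package layout shown above:

from util.loader import column_names as col

# Hypothetical grouping: audio-only columns appended to the shared loader schema.
AUDIO_ONLY_COLUMNS = [
    col.DURATION, col.BIT_RATE, col.SAMPLE_RATE,
    col.CATEGORY, col.GENRE, col.AUDIO_SET, col.ALT_AUDIO_FILES,
]
print(', '.join(AUDIO_ONLY_COLUMNS))
# duration, bit_rate, sample_rate, category, genre, audio_set, alt_audio_files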
15 changes: 8 additions & 7 deletions src/cc_catalog_airflow/dags/util/loader/ingestion_column.py
@@ -6,7 +6,9 @@
import logging
import os

from common.storage.audio import AUDIO_TSV_COLUMNS
from common.storage.image import IMAGE_TSV_COLUMNS
from common.storage import media

logger = logging.getLogger(__name__)

@@ -25,12 +27,11 @@ def check_and_fix_tsv_file(tsv_file_name):
# If no media file is set in the filename, it is
# probably image
media_type = 'image'
old_cols_number = len(IMAGE_TSV_COLUMNS) - 1
if media_type == 'audio':
# TODO: when audio is added:
# old_cols_number = len(AUDIO_TSV_COLUMNS) - 1
old_cols_number = 0
new_cols_number = old_cols_number + 1
new_cols_number = len(AUDIO_TSV_COLUMNS)
else:
new_cols_number = len(IMAGE_TSV_COLUMNS)
old_cols_number = new_cols_number - 1
with open(tsv_file_name) as f:
test_line = f.readline()
line_list = [word.strip() for word in test_line.split('\t')]
@@ -51,8 +52,8 @@


def _add_ingestion_type(tsv_file_name, source):
COMMON_CRAWL = 'commoncrawl'
PROVIDER_API = 'provider_api'
COMMON_CRAWL = media.COMMON_CRAWL
PROVIDER_API = media.PROVIDER_API
ingestion_type = source if source == COMMON_CRAWL else PROVIDER_API
logger.debug(f'Found source: {source}')
logger.info(
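`check_and_fix_tsv_file` above patches staged TSVs written before the `ingestion_type` column existed, which are one column short of the current layout; the expected count comes from `AUDIO_TSV_COLUMNS` or `IMAGE_TSV_COLUMNS` depending on the media type parsed from the file name. A rough sketch of the detection step, with a plain integer standing in for the real column lists:

def needs_ingestion_type(tsv_file_name: str, expected_cols: int) -> bool:
    """Return True when the first row of a staged TSV predates ingestion_type."""
    with open(tsv_file_name) as f:
        first_line = f.readline()
    found_cols = len([word.strip() for word in first_line.split('\t')])
    # A legacy row is exactly one column short of the current layout;
    # audio TSVs are new, so no legacy rows are expected for them.
    return found_cols == expected_cols - 1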
8 changes: 6 additions & 2 deletions src/cc_catalog_airflow/dags/util/loader/loader.py
@@ -23,8 +23,12 @@ def load_local_data(output_dir, postgres_conn_id, identifier, overwrite=False):

def copy_to_s3(output_dir, bucket, identifier, aws_conn_id):
tsv_file_name = paths.get_staged_file(output_dir, identifier)
media_type = _extract_media_type(tsv_file_name)
ingestion_column.check_and_fix_tsv_file(tsv_file_name)
s3.copy_file_to_s3_staging(identifier, tsv_file_name, bucket, aws_conn_id)
s3.copy_file_to_s3_staging(
identifier, tsv_file_name, bucket, aws_conn_id,
media_prefix=media_type
)


def load_s3_data(
@@ -41,7 +45,7 @@ def load_s3_data(
if media_type is None:
media_type = 'image'
tsv_key = s3.get_staged_s3_object(
identifier, bucket, aws_conn_id
identifier, bucket, aws_conn_id, media_prefix=media_type
)
sql.load_s3_data_to_intermediate_table(
postgres_conn_id,
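`_extract_media_type`, referenced in `copy_to_s3` above, infers the media type from the staged TSV file name so it can be passed to S3 as `media_prefix`. A hypothetical sketch of such a helper, assuming staged file names begin with the media type (e.g. `audio_...tsv`) and defaulting to 'image' otherwise:

import os

def _extract_media_type(tsv_file_name: str) -> str:
    """Guess the media type from a staged TSV file name; default to image."""
    media_type = os.path.basename(tsv_file_name).split('_', 1)[0]
    if media_type not in ('image', 'audio'):
        # Older staged files carry no media-type prefix and hold image data.
        media_type = 'image'
    return media_type

print(_extract_media_type('/tmp/audio_jamendo_20210702.tsv'))  # audio
print(_extract_media_type('/tmp/flickr_20210702.tsv'))         # image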