From 3a816554f6a15d20c60c3808a337b7cbc1978dfd Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 15 Jul 2021 17:51:48 -0400 Subject: [PATCH 1/8] Add watermarked column to sql files --- .../local_postgres/0006_openledger_audio_schema.sql | 1 + .../local_postgres/0007_openledger_audio_view.sql | 1 + src/openledger_sql/03_create_audio_schema.sql | 1 + 3 files changed, 3 insertions(+) diff --git a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql index e4770fc09..acd803024 100644 --- a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql +++ b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql @@ -40,6 +40,7 @@ CREATE TABLE public.audio ( title character varying(5000), meta_data jsonb, tags jsonb, + watermarked boolean, last_synced_with_source timestamp with time zone, removed_from_source boolean NOT NULL ); diff --git a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql index 4c0a6979a..dbf7a24b1 100644 --- a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql +++ b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql @@ -73,6 +73,7 @@ CREATE MATERIALIZED VIEW audio_view AS title, meta_data, tags, + watermarked, last_synced_with_source, removed_from_source, standardized_audio_popularity( diff --git a/src/openledger_sql/03_create_audio_schema.sql b/src/openledger_sql/03_create_audio_schema.sql index e4770fc09..acd803024 100644 --- a/src/openledger_sql/03_create_audio_schema.sql +++ b/src/openledger_sql/03_create_audio_schema.sql @@ -40,6 +40,7 @@ CREATE TABLE public.audio ( title character varying(5000), meta_data jsonb, tags jsonb, + watermarked boolean, last_synced_with_source timestamp with time zone, removed_from_source boolean NOT NULL ); From 621765b11c77ee202e5a1f81cb468e5e51e0653e Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 15 Jul 2021 17:53:25 -0400 Subject: [PATCH 2/8] Make field for audio genres plural --- src/cc_catalog_airflow/dags/common/storage/audio.py | 8 ++++---- src/cc_catalog_airflow/dags/util/loader/sql.py | 8 ++++---- .../local_postgres/0006_openledger_audio_schema.sql | 2 +- .../local_postgres/0007_openledger_audio_view.sql | 2 +- src/openledger_sql/03_create_audio_schema.sql | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/storage/audio.py b/src/cc_catalog_airflow/dags/common/storage/audio.py index 3839b150a..ddb8ba37c 100644 --- a/src/cc_catalog_airflow/dags/common/storage/audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/audio.py @@ -73,7 +73,7 @@ name='category', required=False, size=80, truncate=False, ), columns.JSONColumn( - name='genre', required=False, + name='genres', required=False, ), columns.JSONColumn( # set name, set thumbnail, position of audio in set, set url @@ -133,7 +133,7 @@ def add_item( bit_rate: Optional[int] = None, sample_rate: Optional[int] = None, category: Optional[str] = None, - genre: Optional[str] = None, + genres: Optional[str] = None, audio_set: Optional[str] = None, set_position: Optional[int] = None, set_thumbnail: Optional[str] = None, @@ -187,7 +187,7 @@ def add_item( bit_rate: Audio bit rate as int. sample_rate: Audio sample rate as int. category: 'music', 'sound' or 'podcast'. - genre: List of genres + genres: List of genres audio_set: The name of the set (album, pack) the audio is part of set_position: Position of the audio in the audio_set @@ -230,7 +230,7 @@ def add_item( 'bit_rate': bit_rate, 'sample_rate': sample_rate, 'category': category, - 'genre': genre, + 'genres': genres, 'audio_set': audio_set_data, 'alt_audio_files': alt_audio_files, 'source': source, diff --git a/src/cc_catalog_airflow/dags/util/loader/sql.py b/src/cc_catalog_airflow/dags/util/loader/sql.py index e68074436..855ee8828 100644 --- a/src/cc_catalog_airflow/dags/util/loader/sql.py +++ b/src/cc_catalog_airflow/dags/util/loader/sql.py @@ -70,7 +70,7 @@ def create_loading_table( {col.BIT_RATE} integer, {col.SAMPLE_RATE} integer, {col.CATEGORY} character varying(100), - {col.GENRE} jsonb, + {col.GENRES} jsonb, {col.AUDIO_SET} jsonb, {col.ALT_AUDIO_FILES} jsonb ); @@ -296,7 +296,7 @@ def _merge_jsonb_arrays(column: str) -> str: col.BIT_RATE: col.BIT_RATE, col.SAMPLE_RATE: col.SAMPLE_RATE, col.CATEGORY: col.CATEGORY, - col.GENRE: col.GENRE, + col.GENRES: col.GENRES, col.AUDIO_SET: col.AUDIO_SET, col.ALT_AUDIO_FILES: col.ALT_AUDIO_FILES, }) @@ -311,7 +311,7 @@ def _merge_jsonb_arrays(column: str) -> str: {_newest_non_null(col.BIT_RATE)}, {_newest_non_null(col.SAMPLE_RATE)}, {_newest_non_null(col.CATEGORY)}, - {_merge_jsonb_arrays(col.GENRE)}, + {_merge_jsonb_arrays(col.GENRES)}, {_merge_jsonb_objects(col.AUDIO_SET)}, {_merge_jsonb_objects(col.ALT_AUDIO_FILES)} ''' @@ -381,7 +381,7 @@ def overwrite_records_in_db_table( col.BIT_RATE, col.SAMPLE_RATE, col.CATEGORY, - col.GENRE, + col.GENRES, col.AUDIO_SET, col.ALT_AUDIO_FILES, ] diff --git a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql index acd803024..967f87db3 100644 --- a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql +++ b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql @@ -29,7 +29,7 @@ CREATE TABLE public.audio ( bit_rate integer, sample_rate integer, category character varying(200), - genre jsonb, + genres jsonb, audio_set jsonb, alt_audio_files jsonb, filesize integer, diff --git a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql index dbf7a24b1..f59d2ad27 100644 --- a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql +++ b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql @@ -62,7 +62,7 @@ CREATE MATERIALIZED VIEW audio_view AS bit_rate, sample_rate, category, - genre, + genres, audio_set, alt_audio_files, filesize, diff --git a/src/openledger_sql/03_create_audio_schema.sql b/src/openledger_sql/03_create_audio_schema.sql index acd803024..967f87db3 100644 --- a/src/openledger_sql/03_create_audio_schema.sql +++ b/src/openledger_sql/03_create_audio_schema.sql @@ -29,7 +29,7 @@ CREATE TABLE public.audio ( bit_rate integer, sample_rate integer, category character varying(200), - genre jsonb, + genres jsonb, audio_set jsonb, alt_audio_files jsonb, filesize integer, From 6b4fe9ff9187de895aea34e6b37a4759aab3746f Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Tue, 20 Jul 2021 00:35:45 -0400 Subject: [PATCH 3/8] Rename `standardized__popularity` column in view tables --- src/cc_catalog_airflow/dags/util/popularity/sql.py | 3 +-- src/cc_catalog_airflow/dags/util/popularity/test_sql.py | 2 +- .../local_postgres/0004_openledger_image_view.sql | 2 +- .../local_postgres/0007_openledger_audio_view.sql | 2 +- src/openledger_sql/popularity/04_create_image_view.sql | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cc_catalog_airflow/dags/util/popularity/sql.py b/src/cc_catalog_airflow/dags/util/popularity/sql.py index 26be3dc7c..d0400a0d0 100644 --- a/src/cc_catalog_airflow/dags/util/popularity/sql.py +++ b/src/cc_catalog_airflow/dags/util/popularity/sql.py @@ -315,7 +315,6 @@ def create_media_view( db_view_provider_fid_idx = AUDIO_VIEW_PROVIDER_FID_IDX standardized_popularity_func = STANDARDIZED_AUDIO_POPULARITY_FUNCTION postgres = PostgresHook(postgres_conn_id=postgres_conn_id) - STANDARDIZED_POPULARITY = f"standardized_{media_type}_popularity" create_view_query = dedent( f""" CREATE MATERIALIZED VIEW public.{db_view_name} AS @@ -324,7 +323,7 @@ def create_media_view( {standardized_popularity_func}( {table_name}.{PARTITION}, {table_name}.{METADATA_COLUMN} - ) AS {STANDARDIZED_POPULARITY} + ) AS standardized_popularity FROM {table_name}; """ ) diff --git a/src/cc_catalog_airflow/dags/util/popularity/test_sql.py b/src/cc_catalog_airflow/dags/util/popularity/test_sql.py index 4fb4d644e..25ac6fee7 100644 --- a/src/cc_catalog_airflow/dags/util/popularity/test_sql.py +++ b/src/cc_catalog_airflow/dags/util/popularity/test_sql.py @@ -453,7 +453,7 @@ def test_image_view_calculates_std_pop(postgres_with_image_table): _set_up_image_view(postgres_with_image_table, data_query, metrics) check_query = dedent( f""" - SELECT foreign_identifier, standardized_image_popularity + SELECT foreign_identifier, standardized_popularity FROM {image_view}; """ ) diff --git a/src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql b/src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql index 73a2c5eb9..b6a9cf74f 100644 --- a/src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql +++ b/src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql @@ -74,5 +74,5 @@ CREATE MATERIALIZED VIEW image_view AS removed_from_source, standardized_image_popularity( image.provider, image.meta_data - ) AS standardized_image_popularity + ) AS standardized_popularity FROM image; diff --git a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql index f59d2ad27..8e8127782 100644 --- a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql +++ b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql @@ -78,5 +78,5 @@ CREATE MATERIALIZED VIEW audio_view AS removed_from_source, standardized_audio_popularity( audio.provider, audio.meta_data - ) AS standardized_audio_popularity + ) AS standardized_popularity FROM audio; diff --git a/src/openledger_sql/popularity/04_create_image_view.sql b/src/openledger_sql/popularity/04_create_image_view.sql index 8d76ffcd0..4a78c96cb 100644 --- a/src/openledger_sql/popularity/04_create_image_view.sql +++ b/src/openledger_sql/popularity/04_create_image_view.sql @@ -25,5 +25,5 @@ CREATE MATERIALIZED VIEW image_view AS removed_from_source, standardized_image_popularity( image.provider, image.meta_data - ) AS standardized_image_popularity + ) AS standardized_popularity FROM image; From 9895bf2d4d5fcef6537b9dbda9eca9e92f2e1f66 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Tue, 20 Jul 2021 18:26:23 -0400 Subject: [PATCH 4/8] Modify columns in test_audio.py --- src/cc_catalog_airflow/dags/common/storage/test_audio.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/storage/test_audio.py b/src/cc_catalog_airflow/dags/common/storage/test_audio.py index ea672a95e..2d1f16320 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_audio.py @@ -37,18 +37,16 @@ 'creator_url': 'https://creatorurl.com', 'title': 'agreatsong', 'meta_data': {}, - 'watermarked': None, 'raw_tags': {}, 'watermarked': None, 'bit_rate': None, 'sample_rate': None, 'category': None, - 'genre': [], + 'genres': [], 'audio_set': {}, 'alt_audio_files': [], 'source': 'testing_source', 'ingestion_type': 'provider_api', - } @@ -232,7 +230,7 @@ def default_audio_args( bit_rate=None, sample_rate=None, category='music', - genre=['rock', 'pop'], + genres=['rock', 'pop'], alt_audio_files=None, provider='testing_provider', source='testing_source', @@ -450,7 +448,7 @@ def mock_validate_url(url_string): 'bit_rate': 16000, 'sample_rate': 44100, 'category': 'music', - 'genre': ['pop', 'rock'], + 'genres': ['pop', 'rock'], 'audio_set': { 'audio_set': 'album', 'set_position': 1, From 8e2186ef5d94c5f1c82fec82079a232c8ff107ef Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Mon, 26 Jul 2021 22:58:04 -0400 Subject: [PATCH 5/8] Add new `ArrayColumn` type --- .../dags/common/storage/columns.py | 41 +++++++++++++++++++ .../dags/common/storage/test_columns.py | 41 +++++++++++++++++-- .../dags/util/loader/column_names.py | 2 +- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/storage/columns.py b/src/cc_catalog_airflow/dags/common/storage/columns.py index 264d35f91..fc955959e 100644 --- a/src/cc_catalog_airflow/dags/common/storage/columns.py +++ b/src/cc_catalog_airflow/dags/common/storage/columns.py @@ -241,3 +241,44 @@ def prepare_string(self, value): self.SIZE, False ) + + +class ArrayColumn(Column): + """ + Represents a PostgreSQL column of type Array, which should hold elements + of the given BASE_COLUMN type. + + name: name of the corresponding column in the DB + required: whether the column should be considered required by the + instantiating script. (Not necessarily mapping to + `not null` columns in the PostgreSQL table) + base_column: type of the elements in the array, another column + + """ + def __init__(self, name: str, required: bool, base_column: Column): + self.BASE_COLUMN = base_column + super().__init__(name, required) + + def prepare_string(self, value): + """ + Returns a string representation of an array in the PostgreSQL format: + `{, ...}`. + + Apply changes and validations of the corresponding base column type. + """ + input_type = type(value) + + if value is None: + return value + elif input_type != list: + arr_str = self.BASE_COLUMN.prepare_string(value) + return "{" + arr_str + "}" if arr_str else None + + values = [] + for val in value: + if val is None: + values.append(None) + else: + values.append(self.BASE_COLUMN.prepare_string(val)) + arr_str = json.dumps(values, ensure_ascii=False) + return "{" + arr_str[1:-1] + "}" if arr_str else None diff --git a/src/cc_catalog_airflow/dags/common/storage/test_columns.py b/src/cc_catalog_airflow/dags/common/storage/test_columns.py index 7e69c6c2f..bac252de7 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_columns.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_columns.py @@ -179,7 +179,7 @@ def test_BooleanColumn_prepare_string_casts_falselike(): assert all([bc.prepare_string(v) == 'f' for v in falselike_values]) -def test_JSONColumn_prepare_string_nones_empty_list(monkeypatch): +def test_JSONColumn_prepare_string_nones_empty_list(): jc = columns.JSONColumn('test', False) L = [] actual_json = jc.prepare_string(L) @@ -187,7 +187,7 @@ def test_JSONColumn_prepare_string_nones_empty_list(monkeypatch): assert actual_json == expect_json -def test_JSONColumn_prepare_string_nones_empty_dict(monkeypatch): +def test_JSONColumn_prepare_string_nones_empty_dict(): jc = columns.JSONColumn('test', False) D = {} actual_json = jc.prepare_string(D) @@ -195,7 +195,7 @@ def test_JSONColumn_prepare_string_nones_empty_dict(monkeypatch): assert actual_json == expect_json -def test_JSONColumn_prepare_string_returns_json_string(monkeypatch): +def test_JSONColumn_prepare_string_returns_json_string(): jc = columns.JSONColumn('test', False) D = {'test': 'dict'} actual_json = jc.prepare_string(D) @@ -203,7 +203,7 @@ def test_JSONColumn_prepare_string_returns_json_string(monkeypatch): assert actual_json == expect_json -def test_JSONColumn_prepare_string_returns_unicode_json_string(monkeypatch): +def test_JSONColumn_prepare_string_returns_unicode_json_string(): jc = columns.JSONColumn('test', False) D = {'test': u'A unicode \u018e string \xf1'} actual_json = jc.prepare_string(D) @@ -333,3 +333,36 @@ def mock_sanitize_string(some_string): actual_str = uc.prepare_string('test string') expect_str = None assert actual_str == expect_str + + +def test_ArrayColumn_of_StringColumn_prepare_string_returns_pg_array(): + ac = columns.ArrayColumn( + 'test', False, columns.StringColumn( + name='test', size=80, required=False, truncate=False + ) + ) + given_list = ['item1', 'item2'] + actual_str = ac.prepare_string(given_list) + expected_str = '{"item1", "item2"}' + assert actual_str == expected_str + + +def test_ArrayColumn_prepare_string_returns_pg_array_from_single_string(): + ac = columns.ArrayColumn( + 'test', False, columns.StringColumn( + name='test', size=80, required=False, truncate=False + ) + ) + actual_str = ac.prepare_string('abcdef') + expected_str = '{abcdef}' + assert actual_str == expected_str + + +def test_ArrayColumn_of_IntegerColumn_prepare_string_returns_pg_array(): + ac = columns.ArrayColumn( + 'test', False, columns.IntegerColumn(name='test', required=False) + ) + given_list = [1.1, 22, 3, 456] + actual_str = ac.prepare_string(given_list) + expected_str = '{"1", "22", "3", "456"}' + assert actual_str == expected_str diff --git a/src/cc_catalog_airflow/dags/util/loader/column_names.py b/src/cc_catalog_airflow/dags/util/loader/column_names.py index fb9b49b9b..5b4a08646 100644 --- a/src/cc_catalog_airflow/dags/util/loader/column_names.py +++ b/src/cc_catalog_airflow/dags/util/loader/column_names.py @@ -29,6 +29,6 @@ BIT_RATE = 'bit_rate' SAMPLE_RATE = 'sample_rate' CATEGORY = 'category' -GENRE = 'genre' +GENRES = 'genres' AUDIO_SET = 'audio_set' ALT_AUDIO_FILES = 'alt_audio_files' From 8ac0298b6b322007f885d99749f68988732037ed Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Tue, 27 Jul 2021 20:15:26 -0400 Subject: [PATCH 6/8] Change `genres` column to ArrayColumn type --- src/cc_catalog_airflow/dags/common/storage/audio.py | 8 +++++--- src/cc_catalog_airflow/dags/common/storage/test_audio.py | 6 +++--- src/cc_catalog_airflow/dags/util/loader/sql.py | 2 +- .../local_postgres/0006_openledger_audio_schema.sql | 2 +- src/openledger_sql/03_create_audio_schema.sql | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/storage/audio.py b/src/cc_catalog_airflow/dags/common/storage/audio.py index ddb8ba37c..cd2304d94 100644 --- a/src/cc_catalog_airflow/dags/common/storage/audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/audio.py @@ -72,8 +72,10 @@ columns.StringColumn( name='category', required=False, size=80, truncate=False, ), - columns.JSONColumn( - name='genres', required=False, + columns.ArrayColumn( + name='genres', required=False, base_column=columns.StringColumn( + name='genre', required=False, size=80, truncate=False + ) ), columns.JSONColumn( # set name, set thumbnail, position of audio in set, set url @@ -133,7 +135,7 @@ def add_item( bit_rate: Optional[int] = None, sample_rate: Optional[int] = None, category: Optional[str] = None, - genres: Optional[str] = None, + genres: Optional[Union[list, str]] = None, audio_set: Optional[str] = None, set_position: Optional[int] = None, set_thumbnail: Optional[str] = None, diff --git a/src/cc_catalog_airflow/dags/common/storage/test_audio.py b/src/cc_catalog_airflow/dags/common/storage/test_audio.py index 2d1f16320..a9cba1d5c 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_audio.py @@ -275,7 +275,7 @@ def test_create_tsv_row_creates_alt_audio_files( '\\N', '\\N', 'music', - '["rock", "pop"]', + '{"rock", "pop"}', '\\N', '[{"url": ' '"http://alternative.com/audio.mp3", "filesize": "123", "bit_rate": "41000", ' @@ -322,7 +322,7 @@ def test_create_tsv_row_creates_audio_set( '\\N', '\\N', 'music', - '["rock", "pop"]', + '{"rock", "pop"}', '{"audio_set": "test_audio_set", "set_url": "test.com", ' '"set_position": "1", "set_thumbnail": "thumbnail.jpg"}', '\\N', @@ -487,7 +487,7 @@ def mock_validate_url(url_string): '16000', '44100', 'music', - '["pop", "rock"]', + '{"pop", "rock"}', '{"audio_set": "album", "set_position": "1", "set_url": "https://album.com/", ' '"set_thumbnail": "https://album.com/thumbnail.jpg"}', '\\N', diff --git a/src/cc_catalog_airflow/dags/util/loader/sql.py b/src/cc_catalog_airflow/dags/util/loader/sql.py index 855ee8828..9f90ef5a7 100644 --- a/src/cc_catalog_airflow/dags/util/loader/sql.py +++ b/src/cc_catalog_airflow/dags/util/loader/sql.py @@ -70,7 +70,7 @@ def create_loading_table( {col.BIT_RATE} integer, {col.SAMPLE_RATE} integer, {col.CATEGORY} character varying(100), - {col.GENRES} jsonb, + {col.GENRES} character varying(80)[], {col.AUDIO_SET} jsonb, {col.ALT_AUDIO_FILES} jsonb ); diff --git a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql index 967f87db3..e0d676f85 100644 --- a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql +++ b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql @@ -29,7 +29,7 @@ CREATE TABLE public.audio ( bit_rate integer, sample_rate integer, category character varying(200), - genres jsonb, + genres character varying(80)[], audio_set jsonb, alt_audio_files jsonb, filesize integer, diff --git a/src/openledger_sql/03_create_audio_schema.sql b/src/openledger_sql/03_create_audio_schema.sql index 967f87db3..e0d676f85 100644 --- a/src/openledger_sql/03_create_audio_schema.sql +++ b/src/openledger_sql/03_create_audio_schema.sql @@ -29,7 +29,7 @@ CREATE TABLE public.audio ( bit_rate integer, sample_rate integer, category character varying(200), - genres jsonb, + genres character varying(80)[], audio_set jsonb, alt_audio_files jsonb, filesize integer, From dd4bcff0cf0619659fb6ce13b30fdcc755adf2e7 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 29 Jul 2021 09:22:54 -0400 Subject: [PATCH 7/8] Add funtion to merge arrays in sql --- src/cc_catalog_airflow/dags/util/loader/sql.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/cc_catalog_airflow/dags/util/loader/sql.py b/src/cc_catalog_airflow/dags/util/loader/sql.py index 9f90ef5a7..5880f4a4b 100644 --- a/src/cc_catalog_airflow/dags/util/loader/sql.py +++ b/src/cc_catalog_airflow/dags/util/loader/sql.py @@ -261,6 +261,17 @@ def _merge_jsonb_arrays(column: str) -> str: EXCLUDED.{column}, old.{column} )''' + + def _merge_array(column: str) -> str: + return f'''{column} = COALESCE( + ( + SELECT array_agg(DISTINCT x) + FROM unnest(old.{column} || EXCLUDED.{column}) t(x) + ), + EXCLUDED.{column}, + old.{column} + )''' + if db_table is None: db_table = AUDIO_TABLE_NAME \ if media_type == 'audio' else IMAGE_TABLE_NAME @@ -311,7 +322,7 @@ def _merge_jsonb_arrays(column: str) -> str: {_newest_non_null(col.BIT_RATE)}, {_newest_non_null(col.SAMPLE_RATE)}, {_newest_non_null(col.CATEGORY)}, - {_merge_jsonb_arrays(col.GENRES)}, + {_merge_array(col.GENRES)}, {_merge_jsonb_objects(col.AUDIO_SET)}, {_merge_jsonb_objects(col.ALT_AUDIO_FILES)} ''' From 182e1592696c476906c3c1113c09ab455945075b Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 29 Jul 2021 11:14:41 -0400 Subject: [PATCH 8/8] Rename `alt_audio_files` column to `alt_files` --- src/cc_catalog_airflow/dags/common/storage/audio.py | 8 ++++---- .../dags/common/storage/test_audio.py | 12 ++++++------ .../dags/util/loader/column_names.py | 2 +- src/cc_catalog_airflow/dags/util/loader/sql.py | 8 ++++---- .../local_postgres/0006_openledger_audio_schema.sql | 2 +- .../local_postgres/0007_openledger_audio_view.sql | 2 +- src/openledger_sql/03_create_audio_schema.sql | 2 +- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/storage/audio.py b/src/cc_catalog_airflow/dags/common/storage/audio.py index cd2304d94..35b19ce28 100644 --- a/src/cc_catalog_airflow/dags/common/storage/audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/audio.py @@ -83,7 +83,7 @@ ), columns.JSONColumn( # Alternative files: url, filesize, bit_rate, sample_rate - name='alt_audio_files', required=False + name='alt_files', required=False ), ] @@ -140,7 +140,7 @@ def add_item( set_position: Optional[int] = None, set_thumbnail: Optional[str] = None, set_url: Optional[str] = None, - alt_audio_files: Optional[Dict] = None, + alt_files: Optional[Dict] = None, source: Optional[str] = None, ingestion_type: Optional[str] = None, ): @@ -195,7 +195,7 @@ def add_item( set_position: Position of the audio in the audio_set set_thumbnail: URL of the audio_set thumbnail set_url: URL of the audio_set - alt_audio_files: A dictionary with information about alternative + alt_files: A dictionary with information about alternative files for the audio (different formats/ quality). Dict with the following keys: url, filesize, bit_rate, sample_rate @@ -234,7 +234,7 @@ def add_item( 'category': category, 'genres': genres, 'audio_set': audio_set_data, - 'alt_audio_files': alt_audio_files, + 'alt_files': alt_files, 'source': source, 'ingestion_type': ingestion_type, } diff --git a/src/cc_catalog_airflow/dags/common/storage/test_audio.py b/src/cc_catalog_airflow/dags/common/storage/test_audio.py index a9cba1d5c..2e2f654bd 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_audio.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_audio.py @@ -44,7 +44,7 @@ 'category': None, 'genres': [], 'audio_set': {}, - 'alt_audio_files': [], + 'alt_files': [], 'source': 'testing_source', 'ingestion_type': 'provider_api', } @@ -231,27 +231,27 @@ def default_audio_args( sample_rate=None, category='music', genres=['rock', 'pop'], - alt_audio_files=None, + alt_files=None, provider='testing_provider', source='testing_source', ingestion_type='provider_api', ) -def test_create_tsv_row_creates_alt_audio_files( +def test_create_tsv_row_creates_alt_files( default_audio_args, get_good, setup_env, ): audio_store = audio.AudioStore() audio_args = default_audio_args.copy() - alt_audio_files = [{ + alt_files = [{ 'url': 'http://alternative.com/audio.mp3', 'filesize': 123, 'bit_rate': 41000, 'sample_rate': '16000' }] - audio_args['alt_audio_files'] = alt_audio_files + audio_args['alt_files'] = alt_files test_audio = audio.Audio(**audio_args) actual_row = audio_store._create_tsv_row(test_audio) expected_row = '\t'.join([ @@ -455,7 +455,7 @@ def mock_validate_url(url_string): 'set_url': 'https://album.com/', 'set_thumbnail': 'https://album.com/thumbnail.jpg' }, - 'alt_audio_files': None, + 'alt_files': None, 'provider': 'testing_provider', 'source': 'testing_source', 'ingestion_type': 'provider_api', diff --git a/src/cc_catalog_airflow/dags/util/loader/column_names.py b/src/cc_catalog_airflow/dags/util/loader/column_names.py index 5b4a08646..fd611bd9e 100644 --- a/src/cc_catalog_airflow/dags/util/loader/column_names.py +++ b/src/cc_catalog_airflow/dags/util/loader/column_names.py @@ -31,4 +31,4 @@ CATEGORY = 'category' GENRES = 'genres' AUDIO_SET = 'audio_set' -ALT_AUDIO_FILES = 'alt_audio_files' +ALT_FILES = 'alt_files' diff --git a/src/cc_catalog_airflow/dags/util/loader/sql.py b/src/cc_catalog_airflow/dags/util/loader/sql.py index 5880f4a4b..51e66d891 100644 --- a/src/cc_catalog_airflow/dags/util/loader/sql.py +++ b/src/cc_catalog_airflow/dags/util/loader/sql.py @@ -72,7 +72,7 @@ def create_loading_table( {col.CATEGORY} character varying(100), {col.GENRES} character varying(80)[], {col.AUDIO_SET} jsonb, - {col.ALT_AUDIO_FILES} jsonb + {col.ALT_FILES} jsonb ); ''' ) @@ -309,7 +309,7 @@ def _merge_array(column: str) -> str: col.CATEGORY: col.CATEGORY, col.GENRES: col.GENRES, col.AUDIO_SET: col.AUDIO_SET, - col.ALT_AUDIO_FILES: col.ALT_AUDIO_FILES, + col.ALT_FILES: col.ALT_FILES, }) else: column_inserts.update({ @@ -324,7 +324,7 @@ def _merge_array(column: str) -> str: {_newest_non_null(col.CATEGORY)}, {_merge_array(col.GENRES)}, {_merge_jsonb_objects(col.AUDIO_SET)}, - {_merge_jsonb_objects(col.ALT_AUDIO_FILES)} + {_merge_jsonb_objects(col.ALT_FILES)} ''' ) else: @@ -394,7 +394,7 @@ def overwrite_records_in_db_table( col.CATEGORY, col.GENRES, col.AUDIO_SET, - col.ALT_AUDIO_FILES, + col.ALT_FILES, ] else: columns_to_update = [ diff --git a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql index e0d676f85..34bc70ac0 100644 --- a/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql +++ b/src/cc_catalog_airflow/local_postgres/0006_openledger_audio_schema.sql @@ -31,7 +31,7 @@ CREATE TABLE public.audio ( category character varying(200), genres character varying(80)[], audio_set jsonb, - alt_audio_files jsonb, + alt_files jsonb, filesize integer, license character varying(50) NOT NULL, license_version character varying(25), diff --git a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql index 8e8127782..894177793 100644 --- a/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql +++ b/src/cc_catalog_airflow/local_postgres/0007_openledger_audio_view.sql @@ -64,7 +64,7 @@ CREATE MATERIALIZED VIEW audio_view AS category, genres, audio_set, - alt_audio_files, + alt_files, filesize, license, license_version, diff --git a/src/openledger_sql/03_create_audio_schema.sql b/src/openledger_sql/03_create_audio_schema.sql index e0d676f85..34bc70ac0 100644 --- a/src/openledger_sql/03_create_audio_schema.sql +++ b/src/openledger_sql/03_create_audio_schema.sql @@ -31,7 +31,7 @@ CREATE TABLE public.audio ( category character varying(200), genres character varying(80)[], audio_set jsonb, - alt_audio_files jsonb, + alt_files jsonb, filesize integer, license character varying(50) NOT NULL, license_version character varying(25),