diff --git a/src/cc_catalog_airflow/__init__.py b/src/cc_catalog_airflow/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/cc_catalog_airflow/dags/common/__init__.py b/src/cc_catalog_airflow/dags/common/__init__.py index dbcc7e068..3c7be47df 100644 --- a/src/cc_catalog_airflow/dags/common/__init__.py +++ b/src/cc_catalog_airflow/dags/common/__init__.py @@ -1,13 +1,20 @@ # flake8: noqa from .licenses import constants from .licenses.licenses import ( - get_license_info, LicenseInfo, is_valid_license_info + get_license_info, + get_license_info_from_license_pair, + is_valid_license_info, + LicenseInfo, ) from .storage.image import ( - Image, ImageStore, MockImageStore + Image, + ImageStore, + MockImageStore, ) from .storage.audio import ( - Audio, AudioStore, MockAudioStore + Audio, + AudioStore, + MockAudioStore ) from .storage import columns from .requester import DelayedRequester diff --git a/src/cc_catalog_airflow/dags/common/licenses/licenses.py b/src/cc_catalog_airflow/dags/common/licenses/licenses.py index 105f9ff62..8996b196e 100644 --- a/src/cc_catalog_airflow/dags/common/licenses/licenses.py +++ b/src/cc_catalog_airflow/dags/common/licenses/licenses.py @@ -69,7 +69,7 @@ def get_license_info( f'Falling back to given license_ {license_}' f' and license_version {license_version}' ) - license_info = _get_license_info_from_license_pair( + license_info = get_license_info_from_license_pair( license_, license_version ) license_info = (*license_info, license_url) @@ -190,7 +190,7 @@ def _get_valid_cc_url(license_url) -> Optional[str]: return validated_license_url -def _get_license_info_from_license_pair( +def get_license_info_from_license_pair( license_, license_version, pair_map=REVERSE_LICENSE_PATH_MAP ) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ diff --git a/src/cc_catalog_airflow/dags/common/licenses/test_licenses.py b/src/cc_catalog_airflow/dags/common/licenses/test_licenses.py index 32e43d40c..24a5a3861 100644 --- 
a/src/cc_catalog_airflow/dags/common/licenses/test_licenses.py +++ b/src/cc_catalog_airflow/dags/common/licenses/test_licenses.py @@ -177,7 +177,7 @@ def test_get_license_info_from_license_pair_nones_when_missing_license( mock_rewriter ): pair_map = {('by', '1.0'): 'licenses/by/1.0'} - license_info = licenses._get_license_info_from_license_pair( + license_info = licenses.get_license_info_from_license_pair( None, '1.0', pair_map=pair_map @@ -189,7 +189,7 @@ def test_get_license_info_from_license_pair_nones_missing_version( mock_rewriter ): pair_map = {('by', '1.0'): 'licenses/by/1.0'} - license_info = licenses._get_license_info_from_license_pair( + license_info = licenses.get_license_info_from_license_pair( 'by', None, pair_map=pair_map @@ -199,7 +199,7 @@ def test_get_license_info_from_license_pair_nones_missing_version( def test_validate_license_pair_handles_float_version(mock_rewriter): pair_map = {('by', '1.0'): 'licenses/by/1.0'} - actual_license_info = licenses._get_license_info_from_license_pair( + actual_license_info = licenses.get_license_info_from_license_pair( 'by', 1.0, pair_map=pair_map @@ -212,7 +212,7 @@ def test_validate_license_pair_handles_float_version(mock_rewriter): def test_validate_license_pair_handles_int_version(mock_rewriter): pair_map = {('by', '1.0'): 'licenses/by/1.0'} - actual_license_info = licenses._get_license_info_from_license_pair( + actual_license_info = licenses.get_license_info_from_license_pair( 'by', 1, pair_map=pair_map @@ -225,7 +225,7 @@ def test_validate_license_pair_handles_int_version(mock_rewriter): def test_validate_license_pair_handles_na_version(mock_rewriter): pair_map = {('publicdomain', 'N/A'): 'licenses/publicdomain'} - actual_license_info = licenses._get_license_info_from_license_pair( + actual_license_info = licenses.get_license_info_from_license_pair( 'publicdomain', 'N/A', pair_map=pair_map diff --git a/src/cc_catalog_airflow/templates/README.md b/src/cc_catalog_airflow/templates/README.md new file mode 
100644 index 000000000..f415c8c55 --- /dev/null +++ b/src/cc_catalog_airflow/templates/README.md @@ -0,0 +1,22 @@ +## Adding a new provider API script + +Openverse Catalog uses APIs of sites that share openly-licensed media to collect the data about the media and save it to the database. We call the scripts that pull the data from these APIs "Provider API scripts". You can find examples in [`provider_api_scripts` folder](../dags/provider_api_scripts). + +To add a Provider API script using this template, you will need to have Python 3 installed on your machine (preferably, version 3.9). You will also need to know the name of the provider, and the type of media you are going to collect (`image` or `audio`). + +To add a script for collecting audio data from a provider named "MyProvider", open your terminal and run +```bash +python3 src/cc_catalog_airflow/templates/create_api_script.py MyProvider -m audio +``` +You should see output similar to this: +```bash +Creating files in path/to/openverse-catalog +API script: src/cc_catalog_airflow/dags/provider_api_scripts/myprovider.py +API script test: src/cc_catalog_airflow/dags/provider_api_scripts/test_myprovider.py +Airflow workflow file: src/cc_catalog_airflow/dags/myprovider_workflow.py + +``` +The following files have been created: +1. Airflow workflow file. You will probably NOT need to edit it. +2. `myprovider.py` script. This is a template that simplifies creating an API provider script by providing the basic structure. The scripts use small and easily-testable functions. Follow the instructions within the script comments, and complete all the TODOs. Make sure to look at sample `.json` files that will be saved for testing. +3. `test_myprovider.py`. This is a skeleton for your tests. Write tests for the functions in your Provider API script, using the `json` files with sample API responses. 
\ No newline at end of file diff --git a/src/cc_catalog_airflow/templates/__init__.py b/src/cc_catalog_airflow/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/cc_catalog_airflow/templates/create_api_script.py b/src/cc_catalog_airflow/templates/create_api_script.py new file mode 100644 index 000000000..2211560cc --- /dev/null +++ b/src/cc_catalog_airflow/templates/create_api_script.py @@ -0,0 +1,99 @@ +import argparse +from pathlib import Path + + +IMAGE_STORE_INIT = 'image_store = ImageStore(provider=PROVIDER)' +AUDIO_STORE_INIT = 'audio_store = AudioStore(provider=PROVIDER)' + + +def _get_filled_template(template_path, provider, media_type='image'): + with open(template_path, 'r', encoding='utf8') as template: + template_string = template.read() + script_string = template_string.replace( + '{provider_title_case}', provider.title() + ).replace( + '{provider_upper_case}', provider.upper() + ).replace( + '{provider}', provider.lower() + ) + if media_type == 'audio': + media_store_init = AUDIO_STORE_INIT + media_store = 'audio_store' + else: + media_store_init = IMAGE_STORE_INIT + media_store = 'image_store' + script_string = script_string.replace( + 'media_store_init', media_store_init + ).replace( + '{media_store}', media_store + ).replace( + '{media_type}', media_type + ) + + return script_string + + +def fill_template(provider, media_type, templates_path): + project_path = templates_path.parent.parent.parent + template_name = 'template_provider.py_template' + script_template_path = templates_path / template_name + print(f"Creating files in {project_path}") + + dags_path = templates_path.parent / 'dags' + filename = provider.replace(" ", '_').lower() + + api_path = dags_path / 'provider_api_scripts' + api_script_path = api_path / f"{filename}.py" + with open(api_script_path, 'w+', encoding='utf8') as api_script: + api_script_string = _get_filled_template( + script_template_path, provider, media_type + ) + 
api_script.write(api_script_string) + print(f"API script: {api_script_path.relative_to(project_path)}") + + template_name = 'template_test.py_template' + script_template_path = templates_path / template_name + test_script_path = api_path / f"test_{filename}.py" + with open(test_script_path, 'w+', encoding='utf8') as test_script: + test_string = _get_filled_template( + script_template_path, provider, media_type + ) + test_script.write(test_string) + print(f"API script test: {test_script_path.relative_to(project_path)}") + + workflow_template_path = templates_path / 'workflow.py_template' + workflow_path = dags_path / f"{filename}_workflow.py" + with open(workflow_path, 'w+', encoding='utf8') as workflow_file: + workflow_string = _get_filled_template( + workflow_template_path, provider + ) + workflow_file.write(workflow_string) + print("Airflow workflow file: " + f"{workflow_path.relative_to(project_path)}") + + +def main(): + parser = argparse.ArgumentParser( + description='Create a new provider API script', + add_help=True, + ) + parser.add_argument( + "provider", + help='Create the script for this provider (eg. "Wikimedia").') + parser.add_argument( + '-m', '--media', type=str, choices=['image', 'audio'], + help="Script will collect media of this type" + " ('audio'/'image'). 
Default value is 'image'" + ) + args = parser.parse_args() + provider = args.provider + media_type = args.media + if media_type not in ['audio', 'image']: + print("No media type given, assuming it's `image`") + media_type = 'image' + templates_path = Path(__file__).parent + fill_template(provider, media_type, templates_path) + + +if __name__ == "__main__": + main() diff --git a/src/cc_catalog_airflow/templates/template_provider.py_template b/src/cc_catalog_airflow/templates/template_provider.py_template new file mode 100644 index 000000000..1a7ee168f --- /dev/null +++ b/src/cc_catalog_airflow/templates/template_provider.py_template @@ -0,0 +1,392 @@ +""" +Content Provider: {provider_title_case} + +ETL Process: Use the API to identify all openly licensed media. + +Output: TSV file containing the media metadata. + +Notes: {{API URL}} + No rate limit specified. +""" +import json +import os +import logging +from pathlib import Path +from urllib.parse import urlparse + +from common import DelayedRequester, ImageStore, AudioStore +from common.licenses.licenses import get_license_info, get_license_info_from_license_pair +from util.loader import provider_details as prov + +""" +This is template for an API script. Broadly, there are several steps: +1. Download batches of information for the query for openly-licensed media +2. For each item in batch, extract the necessary meta data. +3. Save the metadata using ImageStore.add_item or AudioStore.add_item methods + +Try to write small functions that are easier to test. Don't forget to +write tests, too! + +You can test your script during development by running it: +`python -m .py` + +To extract information from html, you can use lxml.html. +Recommended examples: +- For scripts requiring a start date: `wikimedia_commons.py` +- For scripts without a start date: `science_museum.py`. + +ImageStore/AudioStore are the classes that clean the media metadata +and save it to the disk in form of a tsv file. 
+They will save the file to '/tmp' folder on your computer when you run +this script as is +""" + +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', + level=logging.INFO +) +logger = logging.getLogger(__name__) + +LIMIT = 0  # number of items per page in API response +DELAY = 1  # in seconds +RETRIES = 3 +HOST = '{{host URL}}' +ENDPOINT = f'https://{HOST}/{{API PATH}}' +# TODO: Add Provider constant to the +#       `src/cc_catalog_airflow/dags/util/loader/provider_details.py` file +PROVIDER = prov.{provider_upper_case}_DEFAULT_PROVIDER +# TODO: Add the API key to `src/cc_catalog_airflow/env.template` +# Do not hardcode your API key value! +API_KEY = os.getenv("{provider_upper_case}", "nokeyprovided") + +# TODO: Add any headers necessary for API request +HEADERS = { + "Accept": "application/json", + "api_key": API_KEY, +} +# TODO: Add parameters that are necessary for each API request +DEFAULT_QUERY_PARAMS = { + 'format': 'json', + 'license': 'open-cc-licenses', +} + +delayed_requester = DelayedRequester(DELAY) +media_store_init + + +saved_json_counter = { + 'full_response': 0, + 'empty_response': 0, + 'full_item': 0, + 'no_{media_type}_url': 0, + 'no_foreign_landing_url': 0, + 'no_license': 0, +} + + +def check_and_save_json_for_test(name, data): + parent = Path(__file__).parent + test_resources_path = parent / 'tests' / 'resources' / '{provider}' + if not Path.is_dir(test_resources_path): + Path.mkdir(test_resources_path) + if saved_json_counter[name] == 0: + with open(f"{name}.json", "w+", encoding="utf-8") as outf: + json.dump(data, outf, indent=2) + saved_json_counter[name] += 1 + + +# TODO: Date parameter is necessary for providers with a lot of +#       content. For others, we simply ingest all of the content +#       every time. +def main(): + """ + This script pulls the data for a given date from the {provider_title_case}, + and writes it into a .TSV file to be eventually read + into our DB. 
+ + {{ TODO: Remove if date is not used }} + Required Arguments: + + date: Date String in the form YYYY-MM-DD. This is the date for + which running the script will pull data. }} + """ + + logger.info("Begin: {provider_title_case} script") + {media_type}_count = _get_items() + {media_type}_store.commit() + logger.info(f"Total {media_type}s pulled: {{media_type}_count}") + logger.info('Terminated!') + + +def _get_query_params( + page_number=0, + default_query_params=None, +): + if default_query_params is None: + default_query_params = DEFAULT_QUERY_PARAMS + query_params = default_query_params.copy() + query_params["page"] = str(page_number) + return query_params + + +def _get_items(): + item_count = 0 + page_number = 0 + should_continue = True + while should_continue: + query_params = _get_query_params(page_number=page_number) + + batch_data = _get_batch_json( + query_params=query_params + ) + if isinstance(batch_data, list) and len(batch_data) > 0: + item_count = _process_item_batch(batch_data) + page_number += 1 + else: + should_continue = False + return item_count + + +def _get_batch_json( + endpoint=ENDPOINT, + headers=None, + retries=RETRIES, + query_params=None +): + if headers is None: + headers = HEADERS + response_json = delayed_requester.get_response_json( + endpoint, + retries, + query_params, + headers=headers + ) + if response_json is None: + return None + else: + data = response_json.get("data") + if data: + check_and_save_json_for_test('full_response', data) + else: + check_and_save_json_for_test('empty_response', data) + return data + + +def _process_item_batch(items_batch): + for item in items_batch: + # For testing purposes, you would need to save json data for single + # media objects. To make sure that you test edge cases, + # we add the code that saves a json file per each condition: + # full, and without one of the required properties. 
+ # TODO: save the resulting json files (if any) in the + # `provider_api_scripts/tests/resources/` folder + # TODO: remove the code for saving json files from the final script + + item_meta_data = _extract_item_data(item) + if item_meta_data is None: + continue + {media_store}.add_item(**item_meta_data) + return {media_store}.total_items + + +def _extract_item_data(media_data): + """ + Extract data for individual item + You can view the documentation about media parameters in + `AudioStore.add_item` or `ImageStore.add_item` method + docstrings. + Required properties: + - foreign_landing_url + - image_url / audio_url + - item_license + + Optional properties: + - foreign_identifier + - title + - creator + - creator_url + - thumbnail_url + - metadata + - tags + - watermarked (false by default) + + Optional properties for images: + - width + - height + + Optional properties for audio: + - duration + - bit_rate + - sample_rate + - category + - genre (list of genres) + - audio_set (JSON field) + - alternative audio fields + + """ + # TODO: remove the code for saving json files from the final script + + try: + foreign_landing_url = media_data["links"][0]["url"] + except (TypeError, KeyError, AttributeError): + print(f"Found no foreign landing url:") + print(f"{json.dumps(media_data, indent=2)}") + check_and_save_json_for_test('no_foreign_landing_url', media_data) + return None + # TODO: Choose correct line for media type and delete the other: + # TODO: Delete the unnecessary _get_media_info function + # audio_url, duration = _get_audio_info(media_data) + # image_url, height, width = _get_image_info(media_data) + if {media_type}_url is None: + print(f"Found no media url:") + print(f"{json.dumps(media_data, indent=2)}") + check_and_save_json_for_test('no_{media_type}_url', media_data) + return None + item_license = _get_license(media_data) + if item_license is None: + print(f"Found no item license:") + print(f"{json.dumps(media_data, indent=2)}") + 
check_and_save_json_for_test('no_license', media_data) + return None + foreign_identifier = _get_foreign_identifier(media_data) + title = _get_title(media_data) + creator, creator_url = _get_creator_data(media_data) + thumbnail = _get_thumbnail_url(media_data) + metadata = _get_metadata(media_data) + tags = _get_tags(media_data) + check_and_save_json_for_test('full_item', media_data) + + return { + 'title': title, + 'creator': creator, + 'creator_url': creator_url, + 'foreign_identifier': foreign_identifier, + 'foreign_landing_url': foreign_landing_url, + '{media_type}_url': {media_type}_url, + 'height': height, + 'width': width, + 'thumbnail_url': thumbnail, + 'license_': item_license.license, + 'license_version': item_license.version, + 'meta_data': metadata, + 'raw_tags': tags + } + + +def _get_foreign_identifier(media_data): + try: + return media_data['some_key'][0]['uid'] + except(TypeError, IndexError, AttributeError): + return None + + +def _get_image_info(media_data): + width = media_data.get('width') + height = media_data.get('height') + image_url = media_data.get('image_url') + return image_url, width, height + +def _get_audio_info(media_data): + duration = media_data.get('duration') + audio_url = media_data.get('audio_url') + return audio_url, duration + + +def _get_thumbnail_url(media_data): + # TODO: Add correct implementation of _get_thumbnail_url + return media_data.get('thumbnail', {}).get('url', None) + + +def _get_creator_data(item): + # TODO: Add correct implementation of _get_creator_data + creator = item.get('creator_key').strip() + creator_url = _cleanse_url( + item.get('creator_key', {}).get('url') + ) + return creator, creator_url + + +def _get_title(item): + # TODO: Add correct implementation of _get_title + title = item.get('title') + return title + + +def _get_metadata(item): + """ + Metadata may include: description, date created and modified at source, + categories, popularity statistics. 
+ """ + # TODO: Add function to extract metadata from the item dictionary + # Do not includes keys without value + metadata = {} + some_other_key_value = item.get('some_other_key') + if some_other_key_value is not None: + metadata['some_other_key'] = some_other_key_value + return metadata + + +def _get_tags(item): + # TODO: Add correct implementation of _get_tags + return item.get('tags') + + +def _get_license(item): + """ + To parse the item license, use `get_license_info` function. It + returns a namedtuple LicenseInfo(license_url, license, version) + + It requires either: + 1) a`license_url` (eg. `https://creativecommons.org/licenses/by/4.0/`) + or + 2)`license_name` and `license_version` + + `license_name` can be one of the following: + [ 'by', 'by-nc', 'by-nc-nd', 'by-nc-sa', 'by-nd', 'by-sa', + 'devnations', 'sampling', 'sampling+', + 'publicdomain', 'pdm', 'cc0' ] + + To view all possible licenses, look at licenses/constants. + To validate that the license_name and license_version you get + are correct, you can use `get_license_info_from_license_pair( + license_name, license_version)` + """ + # TODO: add correct implementation of _get_license + # If the provider gives license url: + item_license_url = item.get('license_url') + item_license = get_license_info(license_url=item_license_url) + + # If the provider gives license name and license version + # Note: `publicdomain` does not have version, pass 'N/A' instead + item_license_name = item.get('license_name') + item_license_version = item.get('license_version') + item_license = get_license_info( + license_=item_license_name, + license_version=item_license_version + ) + if item_license.license is None: + return None + return item_license + + +def _cleanse_url(url_string): + """ + Check to make sure that a url is valid, and prepend a protocol if needed + """ + + parse_result = urlparse(url_string) + + if parse_result.netloc == HOST: + parse_result = urlparse(url_string, scheme='https') + elif not 
parse_result.scheme: + parse_result = urlparse(url_string, scheme='http') + + if parse_result.netloc or parse_result.path: + return parse_result.geturl() + + +if __name__ == '__main__': + main() + +# TODO: Remove unnecessary comments +# TODO: Lint your code with pycodestyle diff --git a/src/cc_catalog_airflow/templates/template_test.py_template b/src/cc_catalog_airflow/templates/template_test.py_template new file mode 100644 index 000000000..4a98364f1 --- /dev/null +++ b/src/cc_catalog_airflow/templates/template_test.py_template @@ -0,0 +1,175 @@ +# TODO: Test the small functions you created, +# trying to find different edge cases (missing keys, +# different data types returned, Nones, etc), +# especially the ones found in the `json` files +# with API responses. + + +# Mock the functions that require internet access, +# such as url verification, so that the tests can run +# faster, and even offline. +import json +import logging +import os +from unittest.mock import patch + +import {provider} + +RESOURCES = os.path.join( + os.path.abspath(os.path.dirname(__file__)), 'tests/resources/{provider}' +) + +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', + level=logging.DEBUG, +) + + +def test_get_{media_type}_pages_returns_correctly_with_none_json(): + expect_result = None + with patch.object( + {provider}.delayed_requester, + 'get_response_json', + return_value=None + ): + actual_result = {provider}._get_batch_json() + assert actual_result == expect_result + + +def test_get_{media_type}_pages_returns_correctly_with_no_results(): + expect_result = None + with patch.object( + {provider}.delayed_requester, + 'get_response_json', + return_value={} + ): + actual_result = {provider}._get_batch_json() + assert actual_result == expect_result + + +def test_get_query_params_adds_offset(): + actual_qp = {provider}._get_query_params( + offset=200 + ) + expected_qp = {'offset': 200} + assert actual_qp['offset'] == expected_qp['offset'] + + +def 
test_get_query_params_leaves_other_keys(): + actual_qp = {provider}._get_query_params( + offset=200, default_query_params={'test': 'value'} + ) + assert actual_qp['test'] == 'value' + assert len(actual_qp.keys()) == 2 + + +def test_get_items(): + with open(os.path.join(RESOURCES, 'page1.json')) as f: + first_response = json.load(f) + with patch.object( + {provider}, + '_get_batch_json', + side_effect=[first_response, []] + ): + expected_{media_type}_count = 3 + actual_{media_type}_count = {provider}._get_items() + assert expected_{media_type}_count == actual_{media_type}_count + + +def test_process_item_batch_handles_example_batch(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + items_batch = [json.load(f)] + with patch.object( + {provider}.{media_type}_store, + 'add_item', + return_value=1 + ) as mock_add: + {provider}._process_item_batch(items_batch) + mock_add.assert_called_once() + _, actual_call_args = mock_add.call_args_list[0] + expected_call_args = { + } + assert actual_call_args == expected_call_args + + +def test_extract_{media_type}_data_returns_none_when_media_data_none(): + actual_{media_type}_info = {provider}._extract_item_data(None) + expected_{media_type}_info = None + assert actual_{media_type}_info is expected_{media_type}_info + + +def test_extract_{media_type}_data_returns_none_when_no_foreign_id(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + {media_type}_data.pop('foreign_id', None) + actual_{media_type}_info = {provider}._extract_item_data({media_type}_data) + expected_{media_type}_info = None + assert actual_{media_type}_info is expected_{media_type}_info + + +def test_extract_{media_type}_data_returns_none_when_no_{media_type}_url(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + {media_type}_data.pop('{media_type}_url', None) + actual_{media_type}_info = 
{provider}._extract_item_data({media_type}_data) + assert actual_{media_type}_info is None + + +def test_extract_{media_type}_data_returns_none_when_no_license(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + {media_type}_data.pop('license_url', None) + actual_{media_type}_info = {provider}._extract_item_data({media_type}_data) + assert actual_{media_type}_info is None + + +def test_get_creator_data(): + + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + actual_creator, actual_creator_url = {provider}._get_creator_data({media_type}_data) + expected_creator = '' + expected_creator_url = '' + + assert actual_creator == expected_creator + assert actual_creator_url == expected_creator_url + + +def test_get_creator_data_handles_no_url(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + {media_type}_data.pop('artist_url', None) + expected_creator = '' + + actual_creator, actual_creator_url = {provider}._get_creator_data({media_type}_data) + assert actual_creator == expected_creator + assert actual_creator_url is None + + +def test_get_creator_data_returns_none_when_no_artist(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + {media_type}_data.pop('artist_name', None) + actual_creator, actual_creator_url = {provider}._get_creator_data({media_type}_data) + + assert actual_creator is None + assert actual_creator_url is None + + +def test_extract_{media_type}_data_handles_example_dict(): + with open(os.path.join(RESOURCES, '{media_type}_data_example.json')) as f: + {media_type}_data = json.load(f) + + actual_{media_type}_info = {provider}._extract_item_data({media_type}_data) + expected_{media_type}_info = { + } + assert actual_{media_type}_info == expected_{media_type}_info + + +def test_get_tags(): + item_data = { + 
"tags": ['tag1', 'tag2'] + } + expected_tags = ['tag1', 'tag2'] + actual_tags = {provider}._get_tags(item_data) + assert expected_tags == actual_tags diff --git a/src/cc_catalog_airflow/templates/workflow.py_template b/src/cc_catalog_airflow/templates/workflow.py_template new file mode 100644 index 000000000..a5e116e51 --- /dev/null +++ b/src/cc_catalog_airflow/templates/workflow.py_template @@ -0,0 +1,26 @@ +""" +This file configures the Apache Airflow DAG to (re)ingest {provider_title_case} data. +""" +# airflow DAG (necessary for Airflow to find this file) +from datetime import datetime +import logging + +from provider_api_scripts import {provider} +from util.dag_factory import create_provider_api_workflow + + +logging.basicConfig( + format='%(asctime)s: [%(levelname)s - DAG Loader] %(message)s', + level=logging.DEBUG) +logger = logging.getLogger(__name__) + +DAG_ID = "{provider}_workflow" + +globals()[DAG_ID] = create_provider_api_workflow( + DAG_ID, + {provider}.main, + start_date=datetime(1970, 1, 1), + concurrency=1, + schedule_string='@monthly', + dated=False, +)