Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Create a Provider API script template #93

Merged
merged 27 commits into from
Jul 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7ca3fea
Create a Provider API script template
obulat Jun 8, 2021
b343d84
Merge branch 'main' into template
obulat Jun 9, 2021
7181cd3
Shorten lines
obulat Jun 9, 2021
5e45660
Update src/cc_catalog_airflow/templates/template_provider.py_template
obulat Jun 9, 2021
1331d46
Better wording for script date parameter
obulat Jun 11, 2021
5efe43a
Replace relative path with absolute to fix file not found errors
obulat Jun 17, 2021
4a03768
Make image the default media type
obulat Jun 17, 2021
41714d2
Merge branch 'template' of github.com:WordPress/openverse-catalog int…
obulat Jun 17, 2021
88a777c
Make the script output clearer
obulat Jun 17, 2021
13e607b
Fix typo in provider template script
obulat Jun 25, 2021
67b9b30
Merge branch 'main' into template
obulat Jun 25, 2021
ab8d3cc
Merge remote-tracking branch 'origin/template' into template
obulat Jun 25, 2021
264306c
Improve DAG creation template
obulat Jun 25, 2021
d14d5fc
Update src/cc_catalog_airflow/templates/template_provider.py_template
obulat Jun 29, 2021
c4fda47
Add tests for template script
obulat Jul 2, 2021
b4ee577
Fix item column names
Jul 2, 2021
dbcea1d
Pass `template_name` of the template for tests
Jul 2, 2021
0c74c8b
Add `{media_type}` placeholders to template_provider.py_template
Jul 2, 2021
a4e4372
Minor fixes of typos and comments
Jul 2, 2021
bc32f66
Merge branch 'main' into template
obulat Jul 7, 2021
f2c869c
Template falls back to `image` if no image type specified
obulat Jul 7, 2021
3820404
Small bug and documentation fixes
obulat Jul 7, 2021
fe34ec8
Merge branch 'licenses' into template
obulat Jul 7, 2021
7f7e1df
Make get_license_info_from_license_pair public
obulat Jul 7, 2021
1829f19
Add README
obulat Jul 7, 2021
e05666b
Add sample output to the README
obulat Jul 7, 2021
5fca558
Commit media_store items so that the last batch is not lost
obulat Jul 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
13 changes: 10 additions & 3 deletions src/cc_catalog_airflow/dags/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
# flake8: noqa
from .licenses import constants
from .licenses.licenses import (
get_license_info, LicenseInfo, is_valid_license_info
get_license_info,
get_license_info_from_license_pair,
is_valid_license_info,
LicenseInfo,
)
from .storage.image import (
Image, ImageStore, MockImageStore
Image,
ImageStore,
MockImageStore,
)
from .storage.audio import (
Audio, AudioStore, MockAudioStore
Audio,
AudioStore,
MockAudioStore
)
from .storage import columns
from .requester import DelayedRequester
4 changes: 2 additions & 2 deletions src/cc_catalog_airflow/dags/common/licenses/licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def get_license_info(
f'Falling back to given license_ {license_}'
f' and license_version {license_version}'
)
license_info = _get_license_info_from_license_pair(
license_info = get_license_info_from_license_pair(
license_, license_version
)
license_info = (*license_info, license_url)
Expand Down Expand Up @@ -190,7 +190,7 @@ def _get_valid_cc_url(license_url) -> Optional[str]:
return validated_license_url


def _get_license_info_from_license_pair(
def get_license_info_from_license_pair(
license_, license_version, pair_map=REVERSE_LICENSE_PATH_MAP
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""
Expand Down
10 changes: 5 additions & 5 deletions src/cc_catalog_airflow/dags/common/licenses/test_licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def test_get_license_info_from_license_pair_nones_when_missing_license(
mock_rewriter
):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
license_info = licenses._get_license_info_from_license_pair(
license_info = licenses.get_license_info_from_license_pair(
None,
'1.0',
pair_map=pair_map
Expand All @@ -189,7 +189,7 @@ def test_get_license_info_from_license_pair_nones_missing_version(
mock_rewriter
):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
license_info = licenses._get_license_info_from_license_pair(
license_info = licenses.get_license_info_from_license_pair(
'by',
None,
pair_map=pair_map
Expand All @@ -199,7 +199,7 @@ def test_get_license_info_from_license_pair_nones_missing_version(

def test_validate_license_pair_handles_float_version(mock_rewriter):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'by',
1.0,
pair_map=pair_map
Expand All @@ -212,7 +212,7 @@ def test_validate_license_pair_handles_float_version(mock_rewriter):

def test_validate_license_pair_handles_int_version(mock_rewriter):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'by',
1,
pair_map=pair_map
Expand All @@ -225,7 +225,7 @@ def test_validate_license_pair_handles_int_version(mock_rewriter):

def test_validate_license_pair_handles_na_version(mock_rewriter):
pair_map = {('publicdomain', 'N/A'): 'licenses/publicdomain'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'publicdomain',
'N/A',
pair_map=pair_map
Expand Down
22 changes: 22 additions & 0 deletions src/cc_catalog_airflow/templates/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
## Adding a new provider API script

Openverse Catalog uses the APIs of sites that share openly-licensed media to collect data about the media and save it to the database. We call the scripts that pull the data from these APIs "Provider API scripts". You can find examples in the [`provider_api_scripts` folder](../dags/provider_api_scripts).

To add a Provider API script using this template, you will need to have Python 3 installed on your machine (preferably version 3.9). You will also need to know the name of the provider and the type of media you are going to collect (`image` or `audio`).

To add a script for collecting audio data from a provider named "MyProvider", open your terminal and run:
```bash
python3 src/cc_catalog_airflow/templates/create_api_script.py MyProvider -m audio
```
You should see output similar to this:
```bash
Creating files in path/to/openverse-catalog
API script: src/cc_catalog_airflow/dags/provider_api_scripts/myprovider.py
API script test: src/cc_catalog_airflow/dags/provider_api_scripts/test_myprovider.py
Airflow workflow file: src/cc_catalog_airflow/dags/myprovider_workflow.py

```
The following files have been created:
1. Airflow workflow file. You will probably NOT need to edit it.
2. `myprovider.py` script. This is a template that simplifies creating an API provider script by providing the basic structure. The scripts use small and easily-testable functions. Follow the instructions within the script comments, and complete all the TODOs. Make sure to look at sample `.json` files that will be saved for testing.
3. `test_myprovider.py`. This is a skeleton for your tests. Write tests for the functions in your Provider API script, using the `json` files with sample API responses.
Empty file.
99 changes: 99 additions & 0 deletions src/cc_catalog_airflow/templates/create_api_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import argparse
from pathlib import Path


# Source lines substituted for the `media_store_init` token of a template.
IMAGE_STORE_INIT = 'image_store = ImageStore(provider=PROVIDER)'
AUDIO_STORE_INIT = 'audio_store = AudioStore(provider=PROVIDER)'


def _get_filled_template(template_path, provider, media_type='image'):
    """
    Read a template file and return its text with placeholders filled in.

    Substitutions are applied in this order:

    - ``{provider_title_case}`` -> ``provider.title()``
    - ``{provider_upper_case}`` -> ``provider.upper()``
    - ``{provider}`` -> ``provider.lower()``
    - ``media_store_init`` (matched without braces) -> the store
      initialization line for the media type
    - ``{media_store}`` -> ``'audio_store'`` or ``'image_store'``
    - ``{media_type}`` -> ``'audio'`` or ``'image'``

    :param template_path: path of the template file (read as UTF-8).
    :param provider: provider name as given by the user.
    :param media_type: ``'audio'`` or ``'image'``. Any other value,
        including ``None``, is treated as ``'image'``.
    :return: the filled-in template as a string.
    """
    with open(template_path, 'r', encoding='utf8') as template:
        template_string = template.read()

    if media_type == 'audio':
        media_store_init = AUDIO_STORE_INIT
        media_store = 'audio_store'
    else:
        # Fall back to image for every placeholder, not only the store
        # ones: otherwise `None` would crash `str.replace`, and an
        # unknown value would end up mixed with `image_store`.
        media_type = 'image'
        media_store_init = IMAGE_STORE_INIT
        media_store = 'image_store'

    # Order matters: the more specific `{provider_*}` placeholders are
    # listed before `{provider}`, as in the chained replaces this mirrors.
    replacements = (
        ('{provider_title_case}', provider.title()),
        ('{provider_upper_case}', provider.upper()),
        ('{provider}', provider.lower()),
        ('media_store_init', media_store_init),
        ('{media_store}', media_store),
        ('{media_type}', media_type),
    )
    script_string = template_string
    for placeholder, value in replacements:
        script_string = script_string.replace(placeholder, value)

    return script_string


def _write_filled_template(template_path, output_path, provider, *media):
    """Fill the template at `template_path` and save it to `output_path`."""
    # Render before opening the output: if the template cannot be read,
    # the error is raised before the target file is created or truncated.
    filled = _get_filled_template(template_path, provider, *media)
    with open(output_path, 'w', encoding='utf8') as output_file:
        output_file.write(filled)


def fill_template(provider, media_type, templates_path):
    """
    Create the provider API script, its test and its workflow file.

    The three files are generated from `template_provider.py_template`,
    `template_test.py_template` and `workflow.py_template` found in
    `templates_path`. They are written to the sibling `dags` folder
    (the two scripts go to `dags/provider_api_scripts`). The file names
    are based on the lower-cased provider name, with spaces replaced by
    underscores. The path of every created file is printed relative to
    the project root, which is three levels above `templates_path`.

    :param provider: provider name as given by the user.
    :param media_type: media type passed on to the script and test
        templates; the workflow template is filled with the default.
    :param templates_path: `pathlib.Path` of the templates folder.
    """
    project_path = templates_path.parent.parent.parent
    print(f"Creating files in {project_path}")

    dags_path = templates_path.parent / 'dags'
    api_path = dags_path / 'provider_api_scripts'
    filename = provider.replace(" ", '_').lower()

    api_script_path = api_path / f"{filename}.py"
    _write_filled_template(
        templates_path / 'template_provider.py_template',
        api_script_path,
        provider,
        media_type,
    )
    print(f"API script: {api_script_path.relative_to(project_path)}")

    test_script_path = api_path / f"test_{filename}.py"
    _write_filled_template(
        templates_path / 'template_test.py_template',
        test_script_path,
        provider,
        media_type,
    )
    print(f"API script test: {test_script_path.relative_to(project_path)}")

    workflow_path = dags_path / f"{filename}_workflow.py"
    # No media type is passed here, so the filler uses its default.
    _write_filled_template(
        templates_path / 'workflow.py_template',
        workflow_path,
        provider,
    )
    print("Airflow workflow file: "
          f"{workflow_path.relative_to(project_path)}")


def main():
    """
    Command-line entry point: parse the arguments and create the files.

    Takes a positional `provider` argument and an optional `-m/--media`
    argument restricted to 'image' or 'audio'. When no media type is
    given, a notice is printed and 'image' is used. The templates are
    taken from the folder that contains this script.
    """
    arg_parser = argparse.ArgumentParser(
        description='Create a new provider API script',
        add_help=True,
    )
    arg_parser.add_argument(
        "provider",
        help='Create the script for this provider (eg. "Wikimedia").',
    )
    arg_parser.add_argument(
        '-m',
        '--media',
        type=str,
        choices=['image', 'audio'],
        help=(
            "Script will collect media of this type"
            " ('audio'/'image'). Default value is 'image'"
        ),
    )
    parsed = arg_parser.parse_args()

    chosen_media = parsed.media
    if chosen_media not in ('audio', 'image'):
        print("No media type given, assuming it's `image`")
        chosen_media = 'image'

    fill_template(parsed.provider, chosen_media, Path(__file__).parent)


if __name__ == "__main__":
    main()
Loading