From 20772bb49019a51b72e4a9f22b69ac7c24b6ad23 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Fri, 16 Jul 2021 18:43:24 +0300 Subject: [PATCH] Make wikimedia script pass license_info, not license_url (#129) * Make wikimedia script pass license_info, not license_url Signed-off-by: Olga Bulat * Fix import of get_license_info Co-authored-by: Krystle Salazar --- .../test_wikimedia_commons.py | 51 +++++++++++++------ .../provider_api_scripts/wikimedia_commons.py | 18 ++++--- 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/test_wikimedia_commons.py b/src/cc_catalog_airflow/dags/provider_api_scripts/test_wikimedia_commons.py index 2617b8add..10082b012 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/test_wikimedia_commons.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/test_wikimedia_commons.py @@ -1,9 +1,10 @@ import json import logging import os -from unittest.mock import patch +from unittest.mock import patch, call import wikimedia_commons as wmc +from common.licenses.licenses import get_license_info RESOURCES = os.path.join( os.path.abspath(os.path.dirname(__file__)), 'tests/resources/wikimedia' @@ -252,6 +253,10 @@ def test_extract_title_gets_cleaned_title(): def test_process_image_data_handles_example_dict(): + """ + Converts sample json data to correct image metadata, + and calls `add_item` once for a valid image. + """ with open(os.path.join(RESOURCES, 'image_data_example.json')) as f: image_data = json.load(f) @@ -261,14 +266,17 @@ def test_process_image_data_handles_example_dict(): return_value=1 ) as mock_add: wmc._process_image_data(image_data) - - mock_add.assert_called_once_with( + expected_license_info = get_license_info( + license_url='https://creativecommons.org/licenses/by-sa/4.0' + ) + assert mock_add.call_count == 1 + assert mock_add.call_args == call( foreign_landing_url=( 'https://commons.wikimedia.org/w/index.php?curid=81754323'), image_url=( 'https://upload.wikimedia.org/wikipedia/commons/2/25/20120925_' 'PlozevetBretagne_LoneTree_DSC07971_PtrQs.jpg'), - license_url='https://creativecommons.org/licenses/by-sa/4.0', + license_info=expected_license_info, foreign_identifier=81754323, width=5514, height=3102, @@ -276,17 +284,28 @@ def test_process_image_data_handles_example_dict(): creator_url='https://commons.wikimedia.org/wiki/User:PtrQs', title='20120925 PlozevetBretagne LoneTree DSC07971 PtrQs', meta_data={'description': 'SONY DSC', 'global_usage_count': 0, - 'last_modified_at_source': '2019-09-01 00:38:47', 'date_originally_created': '2012-09-25 16:23:02', - 'categories': [ - 'Coasts of Ploz\u00e9vet', 'No QIC by usr:PtrQs', - ('Photographs taken with Minolta AF Zoom ' - '28-70mm F2.8 G'), - 'Self-published work', 'Taken with Sony DSLR-A900', - 'Trees in Finist\u00e8re']} + 'last_modified_at_source': '2019-09-01 00:38:47', + 'categories': [ + 'Coasts of Ploz\u00e9vet', 'No QIC by usr:PtrQs', + ('Photographs taken with Minolta AF Zoom ' + '28-70mm F2.8 G'), + 'Self-published work', 'Taken with Sony DSLR-A900', + 'Trees in Finist\u00e8re']} ) +def test_process_image_data_adds_example_dict(): + """ + `_process_image_data` calls `ImageStore.add_item` with valid arguments, + and doesn't pass unexpected arguments. Saves the item to the `ImageStore`. + """ + with open(os.path.join(RESOURCES, 'image_data_example.json')) as f: + image_data = json.load(f) + wmc._process_image_data(image_data) + assert wmc.image_store.total_items == 1 + + def test_process_image_data_throws_out_invalid_mediatype(monkeypatch): image_data = {'mediatype': 'INVALID'} @@ -396,14 +415,14 @@ def test_extract_creator_info_handles_link_as_partial_text(): assert expect_creator_url == actual_creator_url -def test_get_license_url_finds_license_url(): +def test_get_license_info_finds_license_url(): with open( os.path.join(RESOURCES, 'image_info_from_example_data.json') ) as f: image_info = json.load(f) - expect_license_url = 'https://creativecommons.org/licenses/by-sa/4.0' - actual_license_url = wmc._get_license_url(image_info) + expect_license_url = 'https://creativecommons.org/licenses/by-sa/4.0/' + actual_license_url = wmc._get_license_info(image_info).url assert actual_license_url == expect_license_url @@ -413,7 +432,7 @@ def test_get_license_url_handles_missing_license_url(): ) as f: image_info = json.load(f) expect_license_url = None - actual_license_url = wmc._get_license_url(image_info) + actual_license_url = wmc._get_license_info(image_info).url assert actual_license_url == expect_license_url @@ -423,7 +442,7 @@ def test_get_license_url_handles_cc0_license(): ) as f: image_info = json.load(f) expect_license_url = 'https://creativecommons.org/publicdomain/zero/1.0/' - actual_license_url = wmc._get_license_url(image_info) + actual_license_url = wmc._get_license_info(image_info).url assert actual_license_url == expect_license_url diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py index c978abd0f..78fe189c6 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py @@ -18,7 +18,12 @@ from urllib.parse import urlparse import lxml.html as html -from common import DelayedRequester, ImageStore +from common import ( + get_license_info, + DelayedRequester, + ImageStore, +) + from util.loader import provider_details as prov logger = logging.getLogger(__name__) @@ -236,8 +241,8 @@ def _process_image_data(image_data): valid_mediatype = _check_mediatype(image_info) if not valid_mediatype: return None - license_url = _get_license_url(image_info) - if license_url is None: + license_info = _get_license_info(image_info) + if license_info.url is None: return None image_url = image_info.get('url') creator, creator_url = _extract_creator_info(image_info) @@ -246,7 +251,7 @@ def _process_image_data(image_data): image_store.add_item( foreign_landing_url=image_info.get('descriptionshorturl'), image_url=image_url, - license_url=license_url, + license_info=license_info, foreign_identifier=foreign_id, width=image_info.get('width'), height=image_info.get('height'), @@ -345,7 +350,7 @@ def _extract_category_info(image_info): return categories_list -def _get_license_url(image_info): +def _get_license_info(image_info): license_url = ( image_info .get('extmetadata', {}) @@ -367,7 +372,8 @@ def _get_license_url(image_info): license_url = 'https://creativecommons.org/publicdomain/zero/1.0/' else: license_url = None - return license_url + license_info = get_license_info(license_url=license_url) + return license_info def _create_meta_data_dict(image_data):