-
Notifications
You must be signed in to change notification settings - Fork 214
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add justtakeitfree as an image provider
- Loading branch information
Showing
6 changed files
with
171 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
92 changes: 92 additions & 0 deletions
92
catalog/dags/providers/provider_api_scripts/justtakeitfree.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
""" | ||
Content Provider: Justtakeitfree | ||
ETL Process: Use the API to identify all CC licensed media. | ||
Output: TSV file containing the media and the | ||
respective meta-data. | ||
Notes: https://justtakeitfree.com/api/api.php | ||
""" | ||
import logging | ||
|
||
from airflow.models import Variable | ||
|
||
from common.constants import IMAGE | ||
from common.licenses import get_license_info | ||
from common.loader import provider_details as prov | ||
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class JusttakeitfreeDataIngester(ProviderDataIngester):
    """Ingest image records from the Justtakeitfree API.

    Requests are sent to ``endpoint`` with a ``page`` counter and an API key
    read from the ``API_KEY_JUSTTAKEITFREE`` Airflow Variable. Every record
    is reported as an image.
    """

    providers = {
        "image": prov.JUSTTAKEITFREE_IMAGE_PROVIDER,
    }
    endpoint = "https://justtakeitfree.com/api/api.php"

    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
        """Return the query parameters for the next request.

        :param prev_query_params: parameters used for the previous request,
            or ``None`` (or an empty dict) for the first request.
        :return: the default parameters (page 0 plus the API key) on the
            first request; otherwise the previous parameters with ``page``
            incremented by one.
        """
        if not prev_query_params:
            return {"page": 0, "key": Variable.get("API_KEY_JUSTTAKEITFREE")}
        return {
            **prev_query_params,
            "page": prev_query_params["page"] + 1,
        }

    def get_batch_data(self, response_json):
        """Extract the batch of records from a response.

        :param response_json: the decoded JSON response, or a falsy value.
        :return: the first element of the response's ``data`` value, or
            ``None`` when the response is empty or ``data`` is missing/empty.
        """
        if not response_json:
            return None
        # NOTE(review): the records are presumably wrapped in an outer list,
        # hence `[0]` - confirm against the API response format.
        # Guard against a missing or empty `data` value so that such a
        # response yields no batch instead of raising TypeError/IndexError.
        batch_wrapper = response_json.get("data")
        if not batch_wrapper:
            return None
        return batch_wrapper[0]

    def get_media_type(self, record: dict):
        """Return the media type of ``record``; always ``IMAGE``."""
        return IMAGE

    def get_record_data(self, data: dict) -> dict | list[dict] | None:
        """Build the record dictionary for a single API item.

        :param data: one item of the batch returned by ``get_batch_data``.
        :return: a dict with the landing URL, image URL, license info,
            foreign identifier, creator details and raw tags; or ``None``
            when the page link, the identifier derived from it, the image
            URL, or the license info is missing.
        """
        # Validate the page link before parsing it: the identifier is derived
        # from it, and a missing/None link must skip the record, not raise.
        if not (foreign_landing_url := data.get("page_link")):
            logger.debug("Skipping record with missing foreign landing url")
            return None

        # The identifier is the second-to-last "/"-separated segment, e.g.
        # "https://justtakeitfree.com/photo/2/" -> "2".
        segments = foreign_landing_url.split("/")
        foreign_identifier = segments[-2] if len(segments) >= 2 else ""
        if not foreign_identifier:
            logger.debug("Skipping record with missing foreign_identifier")
            return None

        if not (url := data.get("full_image_link")):
            logger.debug("Skipping record with missing url")
            return None

        # Use the `get_license_info` utility to get license information from a URL.
        license_url = data.get("license_link")
        license_info = get_license_info(license_url)
        if license_info is None:
            logger.debug("Skipping record with missing license info")
            return None

        # The same creator name and URL are recorded for every item.
        creator = "Justtakeitfree Free Photos"
        creator_url = "https://justtakeitfree.com"
        raw_tags = data.get("tags")

        return {
            "foreign_landing_url": foreign_landing_url,
            "url": url,
            "license_info": license_info,
            "foreign_identifier": foreign_identifier,
            # Optional fields
            "creator": creator,
            "creator_url": creator_url,
            "raw_tags": raw_tags,
        }
|
||
|
||
def main():
    """Run the ingestion directly from the CLI.

    Intended for debugging: it does not require Airflow to be running.
    """
    JusttakeitfreeDataIngester().ingest_records()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
10 changes: 10 additions & 0 deletions
10
catalog/tests/dags/providers/provider_api_scripts/resources/justtakeitfree/single_item.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"credit_requirements": "This image is published under <a href=\"https://creativecommons.org/licenses/by/4.0/deed.en\" target=\"_blank\">CC BY 4.0 licence</a>.<br />Free for commercial use. Attribution required.<br />Please, credit: © Justtakeitfree Free Photos (CC BY 4.0)<br />An active hyperlink to the page should be provided.", | ||
"credit_text": "© Justtakeitfree Free Photos (CC BY 4.0)", | ||
"full_image_link": "https://justtakeitfree.com/photos/2.jpg", | ||
"license": "(CC BY 4.0)", | ||
"license_link": "https://creativecommons.org/licenses/by/4.0/deed.en", | ||
"page_link": "https://justtakeitfree.com/photo/2/", | ||
"preview_link": "https://justtakeitfree.com/photos/2_800.jpg", | ||
"tags": ["Baturyn fortress", "Baturyn citadel", "cossack fortress"] | ||
} |
63 changes: 63 additions & 0 deletions
63
catalog/tests/dags/providers/provider_api_scripts/test_justtakeitfree.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
""" | ||
TODO: Add additional tests for any methods you added in your subclass. | ||
Try to test edge cases (missing keys, different data types returned, Nones, etc). | ||
You may also need to update the given test names to be more specific. | ||
Run your tests locally with `just test -k justtakeitfree` | ||
""" | ||
|
||
import json | ||
from pathlib import Path | ||
|
||
from common.licenses import get_license_info | ||
from providers.provider_api_scripts.justtakeitfree import JusttakeitfreeDataIngester | ||
|
||
|
||
# Directory holding the sample API responses used by these tests.
RESOURCES = Path(__file__).parent / "resources" / "justtakeitfree"

# Single ingester instance shared by all tests in this module.
ingester = JusttakeitfreeDataIngester()
|
||
|
||
def test_get_next_query_params_default_response():
    """With no previous parameters, the first request starts at page 0."""
    params = ingester.get_next_query_params(None)
    # The API key is not part of what this test checks; drop it before comparing.
    params.pop("key", None)
    assert params == {"page": 0}
|
||
|
||
def test_get_next_query_params_updates_parameters():
    """Given previous parameters, the page number advances by one."""
    next_params = ingester.get_next_query_params({"page": 1})
    # Ignore any API key entry; only the pagination is under test.
    next_params.pop("key", None)
    assert next_params == {"page": 2}
|
||
|
||
def test_get_record_data():
    """A complete API item is converted into the expected record dict."""
    # The fixture contains non-ASCII characters, so read it explicitly as
    # UTF-8 rather than relying on the platform's default encoding.
    with open(RESOURCES / "single_item.json", encoding="utf-8") as f:
        resource_json = json.load(f)

    actual_data = ingester.get_record_data(resource_json)

    expected_data = {
        "foreign_landing_url": "https://justtakeitfree.com/photo/2/",
        "url": "https://justtakeitfree.com/photos/2.jpg",
        "foreign_identifier": "2",
        "creator": "Justtakeitfree Free Photos",
        "creator_url": "https://justtakeitfree.com",
        "license_info": get_license_info(
            "https://creativecommons.org/licenses/by/4.0/deed.en"
        ),
        "raw_tags": ["Baturyn fortress", "Baturyn citadel", "cossack fortress"],
    }

    assert actual_data == expected_data