From cc4b2f9fede3f50425ed88647e9ecf6062b1a636 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Mon, 24 Oct 2022 15:34:06 +1100 Subject: [PATCH 1/9] Add default implementation for `get_media_type` for providers with single media type --- .../provider_data_ingester.py | 15 +++++--- .../mock_provider_data_ingester.py | 36 ++++++++++++++++--- .../test_provider_data_ingester.py | 19 ++++++++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index 7028e0c4c..f1db52dcd 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -337,7 +337,7 @@ def get_batch_data(self, response_json): """ pass - def process_batch(self, media_batch): + def process_batch(self, media_batch) -> int: """ Process a batch of records by adding them to the appropriate MediaStore. Returns the total count of records ingested up to this point, for all @@ -371,14 +371,19 @@ def process_batch(self, media_batch): return record_count - @abstractmethod def get_media_type(self, record: dict) -> str: """ For a given record, return the media type it represents (eg "image", "audio", - etc.) If a provider only supports a single media type, this may be hard-coded - to return that type. + etc.) If a provider only supports a single media type, this method defaults + to returning the only media type defined in the ``providers`` attribute. """ - pass + if len(self.providers) == 1: + return list(self.providers.keys())[0] + + raise NotImplementedError( + "Provider scripts that support multiple media types " + "must provide an override for ``get_media_type``." + ) @abstractmethod def get_record_data(self, data: dict) -> dict | list[dict] | None: diff --git a/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py b/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py index 453982d15..ea8b69b9f 100644 --- a/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py +++ b/tests/dags/providers/provider_api_scripts/resources/provider_data_ingester/mock_provider_data_ingester.py @@ -16,10 +16,13 @@ DEFAULT_QUERY_PARAMS = {"has_image": 1, "page": 1} -class MockProviderDataIngester(ProviderDataIngester): +class MockProviderDataIngesterMixin: """ A very simple concrete implementation of the ProviderDataIngester class, for testing purposes. + + Excludes ``get_media_type`` to allow for testing implementations + that do not require it (single media type providers). 
""" providers = {"audio": AUDIO_PROVIDER, "image": IMAGE_PROVIDER} @@ -33,9 +36,6 @@ def get_batch_data(self, response_json): return response_json.get("data") return None - def get_media_type(self, record): - return record["media_type"] - def get_record_data(self, record): data = { "foreign_identifier": record["id"], @@ -50,6 +50,34 @@ def get_record_data(self, record): return data +class MockProviderDataIngester(MockProviderDataIngesterMixin, ProviderDataIngester): + def get_media_type(self, record): + return record["media_type"] + + +class MockImageOnlyProviderDataIngester( + MockProviderDataIngesterMixin, ProviderDataIngester +): + providers = {"image": IMAGE_PROVIDER} + + +class MockAudioOnlyProviderDataIngester( + MockProviderDataIngesterMixin, ProviderDataIngester +): + providers = {"audio": AUDIO_PROVIDER} + + +class IncorrectlyConfiguredMockProviderDataIngester( + MockProviderDataIngesterMixin, ProviderDataIngester +): + """ + Used for testing default method implementions. + """ + + # Do not configure ``get_media_type`` to test the failure case + # for the default implementation + + # Expected result of calling `get_batch_data` with `response_success.json` EXPECTED_BATCH_DATA = [ { diff --git a/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py b/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py index 7da644683..2b4a3de50 100644 --- a/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py +++ b/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py @@ -17,6 +17,9 @@ EXPECTED_BATCH_DATA, IMAGE_PROVIDER, MOCK_RECORD_DATA_LIST, + IncorrectlyConfiguredMockProviderDataIngester, + MockAudioOnlyProviderDataIngester, + MockImageOnlyProviderDataIngester, MockProviderDataIngester, ) @@ -26,6 +29,9 @@ ) ingester = MockProviderDataIngester() +image_ingester = MockImageOnlyProviderDataIngester() +audio_ingester = MockAudioOnlyProviderDataIngester() +misconfigured_ingester = IncorrectlyConfiguredMockProviderDataIngester() audio_store = MockAudioStore(AUDIO_PROVIDER) image_store = MockImageStore(IMAGE_PROVIDER) ingester.media_stores = {"audio": audio_store, "image": image_store} @@ -388,3 +394,16 @@ def test_commit_commits_all_stores(): assert audio_store_mock.called assert image_store_mock.called + + +def test_get_media_type_default_behaviour_multiple_media_types_fails(): + with pytest.raises(NotImplementedError): + misconfigured_ingester.get_media_type({}) + + +def test_get_media_type_default_behaviour_image_only_provider(): + assert image_ingester.get_media_type({}) == "image" + + +def test_get_media_type_default_behaviour_audio_only_provider(): + assert audio_ingester.get_media_type({}) == "audio" From 2781620c29785b838389acf272cbaab7a53098df Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Mon, 24 Oct 2022 15:34:12 +1100 Subject: [PATCH 2/9] Refactor Europeana to use ProviderDataIngester base class --- .../provider_api_scripts/europeana.py | 366 +++++++----------- .../provider_api_scripts/test_europeana.py | 256 ++++++------ 2 files changed, 247 insertions(+), 375 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py index bd0f56b0f..ea924b192 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py @@ -8,266 +8,172 @@ Notes: 
https://www.europeana.eu/api/v2/search.json """ - import argparse import logging from datetime import datetime, timedelta, timezone +import common from airflow.models import Variable from common.licenses import get_license_info from common.loader import provider_details as prov -from common.requester import DelayedRequester -from common.storage.image import ImageStore -from requests.exceptions import JSONDecodeError +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO -) logger = logging.getLogger(__name__) +logging.getLogger(common.urls.__name__).setLevel(logging.WARNING) + + +class EuropeanaRecordBuilder: + """ + A small class to contain the record building functionality + and simplify testing a bit. + """ + + def get_record_data(self, data: dict) -> dict: + record = { + "foreign_landing_url": self.get_foreign_landing_url(data), + "image_url": data.get("edmIsShownBy")[0], + "foreign_identifier": data.get("id"), + "meta_data": self.get_meta_data_dict(data), + "title": data.get("title")[0], + "license_info": get_license_info( + license_url=self.get_license_url(data.get("rights")) + ), + } + + data_providers = set(record["meta_data"]["dataProvider"]) + eligible_sub_providers = { + s + for s in EuropeanaDataIngester.sub_providers + if EuropeanaDataIngester.sub_providers[s] in data_providers + } + if len(eligible_sub_providers) > 1: + raise Exception( + f"More than one sub-provider identified for the " + f"image with foreign ID {record['foreign_identifier']}" + ) + + return record | { + "source": ( + eligible_sub_providers.pop() + if len(eligible_sub_providers) == 1 + else EuropeanaDataIngester.providers["image"] + ) + } + + def get_license_url(self, license_field) -> str | None: + if len(license_field) > 1: + logger.warning("More than one license field found") + for license_ in license_field: + if "creativecommons" in license_: + return license_ + return None + + def get_foreign_landing_url(self, data: dict) -> str: + original_url = data.get("edmIsShownAt") + if original_url is not None: + return original_url[0] + europeana_url = data.get("guid") + return europeana_url + + def get_meta_data_dict(self, data: dict) -> dict: + meta_data = { + "country": data.get("country"), + "dataProvider": data.get("dataProvider"), + "description": self.get_description(data), + } + + return {k: v for k, v in meta_data.items() if v is not None} + + def get_description(self, data: dict) -> str | None: + lang_aware_description = data.get("dcDescriptionLangAware") + if lang_aware_description: + description = lang_aware_description.get( + "en" + ) or lang_aware_description.get("def") + else: + description = data.get("dcDescription") -DELAY = 30.0 -RESOURCES_PER_REQUEST = "100" -PROVIDER = prov.EUROPEANA_DEFAULT_PROVIDER -API_KEY = Variable.get("API_KEY_EUROPEANA", default_var=None) -ENDPOINT = "https://www.europeana.eu/api/v2/search.json?" 
-# SUB_PROVIDERS is a collection of providers within europeana which are -# valuable to a broad audience -SUB_PROVIDERS = prov.EUROPEANA_SUB_PROVIDERS - -RESOURCE_TYPE = "IMAGE" -REUSE_TERMS = ["open", "restricted"] - -DEFAULT_QUERY_PARAMS = { - "profile": "rich", - "reusability": REUSE_TERMS, - "sort": ["europeana_id+desc", "timestamp_created+desc"], - "rows": RESOURCES_PER_REQUEST, - "media": "true", - "start": 1, - "qf": [f"TYPE:{RESOURCE_TYPE}", "provider_aggregation_edm_isShownBy:*"], -} - -delayed_requester = DelayedRequester(DELAY) -image_store = ImageStore(provider=PROVIDER) - - -def main(date): - logger.info(f"Processing Europeana API for date: {date}") - - start_timestamp, end_timestamp = _derive_timestamp_pair(date) - _get_pagewise(start_timestamp, end_timestamp) - - total_images = image_store.commit() - logger.info(f"Total images: {total_images}") - logger.info("Terminated!") - - -def _get_pagewise(start_timestamp, end_timestamp): - cursor = "*" + if description: + return description[0].strip() - while cursor is not None: - image_list, next_cursor, total_number_of_images = _get_image_list( - start_timestamp, end_timestamp, cursor - ) + return "" - if next_cursor is None: - break - cursor = next_cursor +class EuropeanaDataIngester(ProviderDataIngester): + providers = {"image": prov.EUROPEANA_DEFAULT_PROVIDER} + sub_providers = prov.EUROPEANA_SUB_PROVIDERS + batch_limit = 100 + endpoint = "https://www.europeana.eu/api/v2/search.json?" + delay = 30 - if image_list is not None: - images_stored = _process_image_list(image_list) - logger.info(f"Images stored: {images_stored} of {total_number_of_images}") + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - else: - logger.warning("No image data! Attempting to continue") - - -def _get_image_list( - start_timestamp, - end_timestamp, - cursor, - endpoint=ENDPOINT, - max_tries=6, # one original try, plus 5 retries -): - try_number = 0 - image_list, next_cursor, total_number_of_images = (None, None, None) - for try_number in range(max_tries): - - query_param_dict = _build_query_param_dict( - start_timestamp, end_timestamp, cursor - ) - - response = delayed_requester.get( - endpoint, - params=query_param_dict, - ) - - logger.debug("response.status_code: {response.status_code}") - response_json = _extract_response_json(response) - ( - image_list, - next_cursor, - total_number_of_images, - ) = _extract_image_list_from_json(response_json) - - if image_list is not None: - break - - if try_number == max_tries - 1 and (image_list is None or next_cursor is None): - logger.warning("No more tries remaining. Returning None types.") - return image_list, next_cursor, total_number_of_images - - -def _extract_response_json(response): - if response is not None and response.status_code == 200: - try: - response_json = response.json() - except JSONDecodeError as e: - logger.warning(f"Could not get image_data json.\n{e}") - response_json = None - else: - response_json = None + # Each response back from Europeana returns a `nextCursor` + # property that needs to be passed to subsequent requests + # as `cursor`. This allows us to systematically page + # through the API data. 
+ self.cursor = None - return response_json + self.base_request_body = { + "wskey": Variable.get("API_KEY_EUROPEANA", default_var=None), + "profile": "rich", + "reusability": ["open", "restricted"], + "sort": ["europeana_id+desc", "timestamp_created+desc"], + "rows": str(self.batch_limit), + "media": "true", + "start": 1, + "qf": ["TYPE:IMAGE", "provider_aggregation_edm_isShownBy:*"], + # As a dated DAG, Europeana accepts a ``query`` prop in the + # request params that delineates the timestamps between which + # records will have been added. The base class sets up the + # ``self.date`` attribute for us, so we can construct that + # ``query`` prop for the request params ahead of time. + "query": self._get_timestamp_query_param(self.date), + } + self.record_builder = EuropeanaRecordBuilder() -def _extract_image_list_from_json(response_json): - if response_json is None or str(response_json.get("success")) != "True": - image_list, next_cursor, total_number_of_images = None, None, None - else: - image_list = response_json.get("items") - next_cursor = response_json.get("nextCursor") - total_number_of_images = response_json.get("totalResults") + def _get_timestamp_query_param(self, date): + date_obj = datetime.strptime(date, "%Y-%m-%d") + utc_date = date_obj.replace(tzinfo=timezone.utc) + start_timestamp = utc_date.isoformat() + end_timestamp = (utc_date + timedelta(days=1)).isoformat() - return image_list, next_cursor, total_number_of_images + start_timestamp = start_timestamp.replace("+00:00", "Z") + end_timestamp = end_timestamp.replace("+00:00", "Z") + return f"timestamp_created:[{start_timestamp} TO {end_timestamp}]" -def _process_image_list(image_list): - prev_total = 0 - total_images = 0 - for image_data in image_list: - total_images = _process_image_data(image_data) - if total_images is None: - total_images = prev_total - else: - prev_total = total_images - - return total_images - - -def _process_image_data(image_data, sub_providers=SUB_PROVIDERS, provider=PROVIDER): - logger.debug(f"Processing image data: {image_data}") - license_url = _get_license_url(image_data.get("rights")) - image_url = image_data.get("edmIsShownBy")[0] - foreign_landing_url = _get_foreign_landing_url(image_data) - foreign_id = image_data.get("id") - title = image_data.get("title")[0] - meta_data = _create_meta_data_dict(image_data) - - data_providers = set(meta_data["dataProvider"]) - eligible_sub_providers = { - s for s in sub_providers if sub_providers[s] in data_providers - } - if len(eligible_sub_providers) > 1: - raise Exception( - f"More than one sub-provider identified for the " - f"image with foreign ID {foreign_id}" - ) - source = ( - eligible_sub_providers.pop() if len(eligible_sub_providers) == 1 else provider - ) + def get_next_query_params(self, prev_query_params) -> dict: + if not prev_query_params: + return self.base_request_body - license_info = get_license_info(license_url=license_url) + return prev_query_params | { + "cursor": self.cursor, + } - return image_store.add_item( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - license_info=license_info, - foreign_identifier=foreign_id, - title=title, - meta_data=meta_data, - source=source, - ) + def get_should_continue(self, response_json): + if response_json.get("success") != "True": + logger.warning('Request failed with ``success = "False"``') + return False + self.cursor = response_json.get("nextCursor") -def _get_license_url(license_field): - if len(license_field) > 1: - logger.warning("More than one license field found") - for 
license_ in license_field: - if "creativecommons" in license_: - return license_ - return None - - -def _get_foreign_landing_url(image_data): - original_url = image_data.get("edmIsShownAt") - if original_url is not None: - return original_url[0] - europeana_url = image_data.get("guid") - return europeana_url - - -def _create_meta_data_dict(image_data): - meta_data = { - "country": image_data.get("country"), - "dataProvider": image_data.get("dataProvider"), - "description": _get_description(image_data), - } - - return {k: v for k, v in meta_data.items() if v is not None} - - -def _get_description(image_data): - if ( - image_data.get("dcDescriptionLangAware") is not None - and image_data.get("dcDescriptionLangAware").get("en") is not None - ): - description = image_data.get("dcDescriptionLangAware").get("en")[0] - elif ( - image_data.get("dcDescriptionLangAware") is not None - and image_data.get("dcDescriptionLangAware").get("def") is not None - ): - description = image_data.get("dcDescriptionLangAware").get("def")[0] - elif image_data.get("dcDescription") is not None: - description = image_data.get("dcDescription")[0] - else: - description = None - - description = description.strip() if description is not None else "" - - return description - - -def _build_query_param_dict( - start_timestamp, - end_timestamp, - cursor, - api_key=API_KEY, - default_query_param=None, -): - if default_query_param is None: - default_query_param = DEFAULT_QUERY_PARAMS - query_param_dict = default_query_param.copy() - query_param_dict.update( - wskey=api_key, - query=f"timestamp_created:[{start_timestamp} TO {end_timestamp}]", - cursor=cursor, - ) - return query_param_dict + return self.cursor is not None + def get_batch_data(self, response_json): + return response_json.get("items") -def _derive_timestamp_pair(date): - date_obj = datetime.strptime(date, "%Y-%m-%d") - utc_date = date_obj.replace(tzinfo=timezone.utc) - start_timestamp = utc_date.isoformat() - end_timestamp = (utc_date + timedelta(days=1)).isoformat() + def get_record_data(self, data: dict) -> dict: + return self.record_builder.get_record_data(data) - start_timestamp = start_timestamp.replace("+00:00", "Z") - end_timestamp = end_timestamp.replace("+00:00", "Z") - return start_timestamp, end_timestamp +def main(date): + logger.info(f"Begin: Europeana data ingestion for {date}") + ingester = EuropeanaDataIngester(date) + ingester.ingest_records() if __name__ == "__main__": diff --git a/tests/dags/providers/provider_api_scripts/test_europeana.py b/tests/dags/providers/provider_api_scripts/test_europeana.py index c3a8224b2..43447adf2 100644 --- a/tests/dags/providers/provider_api_scripts/test_europeana.py +++ b/tests/dags/providers/provider_api_scripts/test_europeana.py @@ -1,22 +1,18 @@ import json -import logging import os -from unittest.mock import MagicMock, patch -import requests +import pytest from common.licenses import LicenseInfo -from providers.provider_api_scripts import europeana +from providers.provider_api_scripts.europeana import ( + EuropeanaDataIngester, + EuropeanaRecordBuilder, +) RESOURCES = os.path.join( os.path.abspath(os.path.dirname(__file__)), "resources/europeana" ) -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", - level=logging.DEBUG, -) - def _get_resource_json(json_name): with open(os.path.join(RESOURCES, json_name)) as f: @@ -25,120 +21,95 @@ def _get_resource_json(json_name): return resource_json -def test_derive_timestamp_pair(): - # Note that the timestamps are derived as if input 
was in UTC. - start_ts, end_ts = europeana._derive_timestamp_pair("2018-01-15") - assert start_ts == "2018-01-15T00:00:00Z" - assert end_ts == "2018-01-16T00:00:00Z" +FROZEN_DATE = "2018-01-15" -def test_get_image_list_retries_with_none_response(): - with patch.object( - europeana.delayed_requester, "get", return_value=None - ) as mock_get: - europeana._get_image_list("1234", "5678", "test_cursor", max_tries=3) +@pytest.fixture +def ingester() -> EuropeanaDataIngester: + return EuropeanaDataIngester(date=FROZEN_DATE) - assert mock_get.call_count == 3 +@pytest.fixture +def record_builder() -> EuropeanaRecordBuilder: + return EuropeanaRecordBuilder() -def test_get_image_list_for_last_page(): - response_json = _get_resource_json("europeana_example.json") - response_json["items"] = [] - response_json.pop("nextCursor", None) - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response_json) +def test_derive_timestamp_pair(ingester): + # Note that the timestamps are derived as if input was in UTC. + # The timestamps below depend on the ``FROZEN_DATE`` constant + # defined above. + assert ingester.base_request_body["query"] == ( + "timestamp_created:[2018-01-15T00:00:00Z TO 2018-01-16T00:00:00Z]" + ) - with patch.object(europeana.delayed_requester, "get", return_value=r) as mock_get: - europeana._get_image_list("1234", "5678", "test_cursor") - mock_get.assert_called_once() +def test_get_next_query_params_uses_default_first_pass(ingester): + assert ingester.get_next_query_params({}) == ingester.base_request_body -def test_get_image_list_retries_with_non_ok_response(): - response_json = _get_resource_json("europeana_example.json") - r = requests.Response() - r.status_code = 504 - r.json = MagicMock(return_value=response_json) - with patch.object(europeana.delayed_requester, "get", return_value=r) as mock_get: - europeana._get_image_list("1234", "5678", "test_cursor", max_tries=3) +def test_get_next_query_params_updates_cursor(ingester): + prev_query_params = ingester.base_request_body.copy() + # Set cursor to something, by default it will be empty + cursor = 243392 + ingester.cursor = cursor - assert mock_get.call_count == 3 + # test that it will add the cursor when none was previously set + next_query_params = ingester.get_next_query_params(prev_query_params) + assert next_query_params == prev_query_params | {"cursor": cursor} + # next test that it actually also updates any existing cursor + next_cursor = cursor + 1 + ingester.cursor = next_cursor -def test_get_image_list_with_realistic_response(): - response_json = _get_resource_json("europeana_example.json") - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response_json) - with patch.object(europeana.delayed_requester, "get", return_value=r) as mock_get: - image_list, next_cursor, total_number_of_images = europeana._get_image_list( - "1234", "5678", "test_cursor", max_tries=3 - ) - expect_image_list = _get_resource_json("europeana_image_list.json") - - assert mock_get.call_count == 1 - assert image_list == expect_image_list - - -# This test will fail if default constants change. 
-def test_build_query_param_dict_default(): - start_timestamp = "1234" - end_timestamp = "5678" - europeana_api_key = "test_key" - resource_type = "IMAGE" - reuse_terms = ["open", "restricted"] - resources_per_request = "100" - - actual_query_param_dict = europeana._build_query_param_dict( - start_timestamp, end_timestamp, "test_cursor", api_key=europeana_api_key + next_query_params_with_updated_cursor = ingester.get_next_query_params( + next_query_params ) - expect_query_param_dict = { - "wskey": europeana_api_key, - "profile": "rich", - "reusability": reuse_terms, - "sort": ["europeana_id+desc", "timestamp_created+desc"], - "rows": resources_per_request, - "media": "true", - "start": 1, - "qf": [f"TYPE:{resource_type}", "provider_aggregation_edm_isShownBy:*"], - "query": f"timestamp_created:[{start_timestamp} TO {end_timestamp}]", - "cursor": "test_cursor", + + assert next_query_params_with_updated_cursor == next_query_params | { + "cursor": next_cursor } - assert actual_query_param_dict == expect_query_param_dict -def test_extract_image_list_from_json_handles_realistic_input(): - test_dict = _get_resource_json("europeana_example.json") - expect_image_list = _get_resource_json("europeana_image_list.json") - expect_next_cursor = "test_next_cursor" - expect_total_number_of_images = 27 +def test_get_should_continue_updates_cursor(ingester): + assert ingester.cursor is None + + response_json = { + "nextCursor": 123533, + "success": "True", + } + + assert ingester.get_should_continue(response_json) is True + + assert ingester.cursor == response_json["nextCursor"] + + +@pytest.mark.parametrize( + ("response_json"), ( - actual_image_list, - actual_next_cursor, - actual_total_number_of_images, - ) = europeana._extract_image_list_from_json(test_dict) - assert actual_image_list == expect_image_list - assert actual_next_cursor == expect_next_cursor - assert actual_total_number_of_images == expect_total_number_of_images + {"success": "True", "nextCursor": None}, + {"success": "True"}, + {"success": "False", "nextCursor": "blam"}, + ), +) +def test_get_should_continue_returns_false(ingester, response_json): + assert ingester.get_should_continue(response_json) is False -def test_extract_image_list_from_json_returns_nones_given_non_true_success(): - test_dict = {"success": "false", "nextCursor": "test_next_cursor"} - assert europeana._extract_image_list_from_json(test_dict) == (None, None, None) +def test_get_batch_data_gets_items_property(ingester): + response_json = {"items": object()} + assert ingester.get_batch_data(response_json) is response_json["items"] -def test_extract_image_list_from_json_returns_nones_given_none_json(): - assert europeana._extract_image_list_from_json(None) == (None, None, None) +def test_get_image_list_with_realistic_response(ingester): + response_json = _get_resource_json("europeana_example.json") + record_count = ingester.process_batch(response_json["items"]) + assert record_count == len(response_json["items"]) -def test_process_image_data_with_real_example(): + +def test_record_builder_get_record_data(ingester, record_builder): image_data = _get_resource_json("image_data_example.json") - with patch.object( - europeana.image_store, "add_item", return_value=100 - ) as mock_add_item: - total_images = europeana._process_image_data(image_data) + record_data = record_builder.get_record_data(image_data) expect_meta_data = { "country": ["Spain"], @@ -146,15 +117,15 @@ def test_process_image_data_with_real_example(): "description": "Sello en seco: España artística y monumental.", 
} - mock_add_item.assert_called_once_with( - foreign_landing_url=( + assert record_data == { + "foreign_landing_url": ( "http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?" "id=26229" ), - image_url=( + "image_url": ( "http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes" "/imagen_id.cmd?idImagen=102620362" ), - license_info=( + "license_info": ( LicenseInfo( "cc0", "1.0", @@ -162,49 +133,48 @@ def test_process_image_data_with_real_example(): "http://creativecommons.org/publicdomain/zero/1.0/", ) ), - foreign_identifier="/2022704/lod_oai_bibliotecadigital_jcyl_es_26229_ent1", - title=( + "foreign_identifier": "/2022704/lod_oai_bibliotecadigital_jcyl_es_26229_ent1", + "title": ( "Claustro del Monasterio de S. Salvador en Oña [Material gráfico]" "= Cloître du Monastère de S. Salvador à Oña" ), - meta_data=expect_meta_data, - source=europeana.PROVIDER, - ) - assert total_images == 100 + "meta_data": expect_meta_data, + "source": ingester.providers["image"], + } -def test_get_license_url_with_real_example(): +def test_record_builder_get_license_url_with_real_example(record_builder): rights_field = ["http://creativecommons.org/publicdomain/zero/1.0/"] assert ( - europeana._get_license_url(rights_field) + record_builder.get_license_url(rights_field) == "http://creativecommons.org/publicdomain/zero/1.0/" ) -def test_get_license_url_with_non_cc_license(): +def test_get_license_url_with_non_cc_license(record_builder): rights_field = ["http://noncc.org/"] - assert europeana._get_license_url(rights_field) is None + assert record_builder.get_license_url(rights_field) is None -def test_get_license_url_with_multiple_license(): +def test_get_license_url_with_multiple_license(record_builder): rights_field = [ "http://noncc.org/", "http://creativecommons.org/publicdomain/zero/1.0/", ] expect_license = "http://creativecommons.org/publicdomain/zero/1.0/" - assert europeana._get_license_url(rights_field) == expect_license + assert record_builder.get_license_url(rights_field) == expect_license -def test_get_foreign_landing_url_with_edmIsShownAt(): +def test_get_foreign_landing_url_with_edmIsShownAt(record_builder): image_data = _get_resource_json("image_data_example.json") expect_url = "http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=26229" - assert europeana._get_foreign_landing_url(image_data) == expect_url + assert record_builder.get_foreign_landing_url(image_data) == expect_url -def test_get_foreign_landing_url_without_edmIsShownAt(): +def test_get_foreign_landing_url_without_edmIsShownAt(record_builder): image_data = _get_resource_json("image_data_example.json") image_data.pop("edmIsShownAt", None) expect_url = ( @@ -212,10 +182,10 @@ def test_get_foreign_landing_url_without_edmIsShownAt(): "_es_26229_ent1?utm_source=api&utm_medium=api&utm_campaign=test_key" ) - assert europeana._get_foreign_landing_url(image_data) == expect_url + assert record_builder.get_foreign_landing_url(image_data) == expect_url -def test_create_meta_data_dict(): +def test_get_meta_data_dict(record_builder): image_data = _get_resource_json("image_data_example.json") expect_meta_data = { @@ -224,10 +194,10 @@ def test_create_meta_data_dict(): "description": "Sello en seco: España artística y monumental.", } - assert europeana._create_meta_data_dict(image_data) == expect_meta_data + assert record_builder.get_meta_data_dict(image_data) == expect_meta_data -def test_create_meta_data_dict_without_country(): +def test_get_meta_data_dict_without_country(record_builder): image_data = 
_get_resource_json("image_data_example.json") image_data.pop("country", None) @@ -236,10 +206,10 @@ def test_create_meta_data_dict_without_country(): "description": "Sello en seco: España artística y monumental.", } - assert europeana._create_meta_data_dict(image_data) == expect_meta_data + assert record_builder.get_meta_data_dict(image_data) == expect_meta_data -def test_get_description_with_langaware_en(): +def test_get_description_with_langaware_en(record_builder): image_data = _get_resource_json("image_data_example.json") image_data["dcDescriptionLangAware"]["en"] = [ "First English Description", @@ -247,40 +217,37 @@ def test_get_description_with_langaware_en(): ] expect_description = "First English Description" - assert europeana._get_description(image_data) == expect_description + assert record_builder.get_description(image_data) == expect_description -def test_get_description_with_langaware_def(): +def test_get_description_with_langaware_def(record_builder): image_data = _get_resource_json("image_data_example.json") expect_description = "Sello en seco: España artística y monumental." - assert europeana._get_description(image_data) == expect_description + assert record_builder.get_description(image_data) == expect_description -def test_get_description_without_langaware(): +def test_get_description_without_langaware(record_builder): image_data = _get_resource_json("image_data_example.json") image_data.pop("dcDescriptionLangAware", None) expect_description = "Sello en seco: España artística y monumental." - assert europeana._get_description(image_data) == expect_description + assert record_builder.get_description(image_data) == expect_description -def test_get_description_without_description(): +def test_get_description_without_description(record_builder): image_data = _get_resource_json("image_data_example.json") image_data.pop("dcDescriptionLangAware", None) image_data.pop("dcDescription", None) expect_description = "" - assert europeana._get_description(image_data) == expect_description + assert record_builder.get_description(image_data) == expect_description -def test_process_image_data_with_sub_provider(): +def test_process_image_data_with_sub_provider(record_builder): image_data = _get_resource_json("image_data_sub_provider_example.json") - with patch.object( - europeana.image_store, "add_item", return_value=100 - ) as mock_add_item: - total_images = europeana._process_image_data(image_data) + record_data = record_builder.get_record_data(image_data) expect_meta_data = { "country": ["United Kingdom"], @@ -288,24 +255,23 @@ def test_process_image_data_with_sub_provider(): "description": "Lettering: Greenwich Hospital.", } - mock_add_item.assert_called_once_with( - foreign_landing_url="https://wellcomecollection.org/works/zzwnbyhb", - image_url=( + assert record_data == { + "foreign_landing_url": "https://wellcomecollection.org/works/zzwnbyhb", + "image_url": ( "https://iiif.wellcomecollection.org/image/V0013398.jpg/full/512," "/0/default.jpg" ), - license_info=LicenseInfo( + "license_info": LicenseInfo( "by", "4.0", "https://creativecommons.org/licenses/by/4.0/", "http://creativecommons.org/licenses/by/4.0/", ), - foreign_identifier="/9200579/zzwnbyhb", - title=( + "foreign_identifier": "/9200579/zzwnbyhb", + "title": ( "Royal Naval Hospital, Greenwich, with ships and rowing boats " "in the foreground. Engraving." 
         ),
-        meta_data=expect_meta_data,
-        source="wellcome_collection",
-    )
-    assert total_images == 100
+        "meta_data": expect_meta_data,
+        "source": "wellcome_collection",
+    }

From dfac506409087cc3a48d689ef49e7503bc65940b Mon Sep 17 00:00:00 2001
From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com>
Date: Tue, 25 Oct 2022 09:24:22 +1100
Subject: [PATCH 3/9] Fix description missing if en or def are empty

Also update record builder tests to not call individual methods,
removing the assumption that the record builder methods operate
individually. Making all methods other than `get_record_data` private
on the record builder class solidifies this.
---
 .../provider_api_scripts/europeana.py      | 20 ++---
 .../provider_api_scripts/test_europeana.py | 75 +++++++++++++------
 2 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py
index ea924b192..345a7b6a7 100644
--- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py
+++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py
@@ -31,13 +31,13 @@ class EuropeanaRecordBuilder:
 
     def get_record_data(self, data: dict) -> dict:
         record = {
-            "foreign_landing_url": self.get_foreign_landing_url(data),
+            "foreign_landing_url": self._get_foreign_landing_url(data),
             "image_url": data.get("edmIsShownBy")[0],
             "foreign_identifier": data.get("id"),
-            "meta_data": self.get_meta_data_dict(data),
+            "meta_data": self._get_meta_data_dict(data),
             "title": data.get("title")[0],
             "license_info": get_license_info(
-                license_url=self.get_license_url(data.get("rights"))
+                license_url=self._get_license_url(data.get("rights"))
             ),
         }
 
@@ -61,7 +61,7 @@ def get_record_data(self, data: dict) -> dict:
             )
         }
 
-    def get_license_url(self, license_field) -> str | None:
+    def _get_license_url(self, license_field) -> str | None:
         if len(license_field) > 1:
             logger.warning("More than one license field found")
         for license_ in license_field:
@@ -69,29 +69,31 @@ def get_license_url(self, license_field) -> str | None:
                 return license_
         return None
 
-    def get_foreign_landing_url(self, data: dict) -> str:
+    def _get_foreign_landing_url(self, data: dict) -> str:
         original_url = data.get("edmIsShownAt")
         if original_url is not None:
             return original_url[0]
         europeana_url = data.get("guid")
         return europeana_url
 
-    def get_meta_data_dict(self, data: dict) -> dict:
+    def _get_meta_data_dict(self, data: dict) -> dict:
         meta_data = {
             "country": data.get("country"),
             "dataProvider": data.get("dataProvider"),
-            "description": self.get_description(data),
+            "description": self._get_description(data),
         }
 
         return {k: v for k, v in meta_data.items() if v is not None}
 
-    def get_description(self, data: dict) -> str | None:
+    def _get_description(self, data: dict) -> str | None:
+        description = None
         lang_aware_description = data.get("dcDescriptionLangAware")
         if lang_aware_description:
             description = lang_aware_description.get(
                 "en"
             ) or lang_aware_description.get("def")
-        else:
+
+        if not description:  # cover None and []
             description = data.get("dcDescription")
 
         if description:
diff --git a/tests/dags/providers/provider_api_scripts/test_europeana.py b/tests/dags/providers/provider_api_scripts/test_europeana.py
index 43447adf2..91e51e485 100644
--- a/tests/dags/providers/provider_api_scripts/test_europeana.py
+++ b/tests/dags/providers/provider_api_scripts/test_europeana.py
@@ -2,7 +2,7 @@ import json
 import os
 
 import pytest
-from common.licenses import LicenseInfo
+from 
common.licenses import LicenseInfo, get_license_info from providers.provider_api_scripts.europeana import ( EuropeanaDataIngester, EuropeanaRecordBuilder, @@ -144,34 +144,42 @@ def test_record_builder_get_record_data(ingester, record_builder): def test_record_builder_get_license_url_with_real_example(record_builder): - rights_field = ["http://creativecommons.org/publicdomain/zero/1.0/"] + image_data = _get_resource_json("image_data_example.json") + image_data["rights"] = ["http://creativecommons.org/publicdomain/zero/1.0/"] - assert ( - record_builder.get_license_url(rights_field) - == "http://creativecommons.org/publicdomain/zero/1.0/" - ) + assert record_builder.get_record_data(image_data)[ + "license_info" + ] == get_license_info("http://creativecommons.org/publicdomain/zero/1.0/") def test_get_license_url_with_non_cc_license(record_builder): - rights_field = ["http://noncc.org/"] + image_data = _get_resource_json("image_data_example.json") + image_data["rights"] = ["http://noncc.org/"] - assert record_builder.get_license_url(rights_field) is None + assert record_builder.get_record_data(image_data)["license_info"] == LicenseInfo( + None, None, None, None + ) def test_get_license_url_with_multiple_license(record_builder): - rights_field = [ + image_data = _get_resource_json("image_data_example.json") + image_data["rights"] = [ "http://noncc.org/", "http://creativecommons.org/publicdomain/zero/1.0/", ] - expect_license = "http://creativecommons.org/publicdomain/zero/1.0/" - assert record_builder.get_license_url(rights_field) == expect_license + expect_license = get_license_info( + "http://creativecommons.org/publicdomain/zero/1.0/" + ) + assert record_builder.get_record_data(image_data)["license_info"] == expect_license def test_get_foreign_landing_url_with_edmIsShownAt(record_builder): image_data = _get_resource_json("image_data_example.json") expect_url = "http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=26229" - assert record_builder.get_foreign_landing_url(image_data) == expect_url + assert ( + record_builder.get_record_data(image_data)["foreign_landing_url"] == expect_url + ) def test_get_foreign_landing_url_without_edmIsShownAt(record_builder): @@ -182,7 +190,9 @@ def test_get_foreign_landing_url_without_edmIsShownAt(record_builder): "_es_26229_ent1?utm_source=api&utm_medium=api&utm_campaign=test_key" ) - assert record_builder.get_foreign_landing_url(image_data) == expect_url + assert ( + record_builder.get_record_data(image_data)["foreign_landing_url"] == expect_url + ) def test_get_meta_data_dict(record_builder): @@ -194,7 +204,7 @@ def test_get_meta_data_dict(record_builder): "description": "Sello en seco: España artística y monumental.", } - assert record_builder.get_meta_data_dict(image_data) == expect_meta_data + assert record_builder.get_record_data(image_data)["meta_data"] == expect_meta_data def test_get_meta_data_dict_without_country(record_builder): @@ -206,10 +216,19 @@ def test_get_meta_data_dict_without_country(record_builder): "description": "Sello en seco: España artística y monumental.", } - assert record_builder.get_meta_data_dict(image_data) == expect_meta_data + assert record_builder.get_record_data(image_data)["meta_data"] == expect_meta_data + + +@pytest.fixture +def assert_description(record_builder): + def fn(image_data, expected_description): + record_data = record_builder.get_record_data(image_data) + assert record_data["meta_data"]["description"] == expected_description + + return fn -def 
test_get_description_with_langaware_en(record_builder): +def test_get_description_with_langaware_en(assert_description): image_data = _get_resource_json("image_data_example.json") image_data["dcDescriptionLangAware"]["en"] = [ "First English Description", @@ -217,32 +236,42 @@ def test_get_description_with_langaware_en(record_builder): ] expect_description = "First English Description" - assert record_builder.get_description(image_data) == expect_description + assert_description(image_data, expect_description) -def test_get_description_with_langaware_def(record_builder): +def test_get_description_with_langaware_def(assert_description): image_data = _get_resource_json("image_data_example.json") expect_description = "Sello en seco: España artística y monumental." - assert record_builder.get_description(image_data) == expect_description + assert_description(image_data, expect_description) -def test_get_description_without_langaware(record_builder): +def test_get_description_without_langaware(assert_description): image_data = _get_resource_json("image_data_example.json") image_data.pop("dcDescriptionLangAware", None) expect_description = "Sello en seco: España artística y monumental." - assert record_builder.get_description(image_data) == expect_description + assert_description(image_data, expect_description) -def test_get_description_without_description(record_builder): +def test_get_description_without_description(assert_description): image_data = _get_resource_json("image_data_example.json") image_data.pop("dcDescriptionLangAware", None) image_data.pop("dcDescription", None) expect_description = "" - assert record_builder.get_description(image_data) == expect_description + assert_description(image_data, expect_description) + + +def test_get_description_dcDescriptionLangAware_without_en_or_def(assert_description): + image_data = _get_resource_json("image_data_example.json") + # Need to give dcDescriptionLangAware _something_ to thwart naive + # falsy checks + image_data["dcDescriptionLangAware"] = {"pt": "Não sou uma descrição"} + + expect_description = image_data["dcDescription"][0] + assert_description(image_data, expect_description) def test_process_image_data_with_sub_provider(record_builder): From 384380dc24ea8a8192c87a6234da170ec882750e Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 25 Oct 2022 09:25:58 +1100 Subject: [PATCH 4/9] Remove unnecessary batch_limit override --- .../dags/providers/provider_api_scripts/europeana.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py index 345a7b6a7..65617d184 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py @@ -105,7 +105,6 @@ def _get_description(self, data: dict) -> str | None: class EuropeanaDataIngester(ProviderDataIngester): providers = {"image": prov.EUROPEANA_DEFAULT_PROVIDER} sub_providers = prov.EUROPEANA_SUB_PROVIDERS - batch_limit = 100 endpoint = "https://www.europeana.eu/api/v2/search.json?" 
delay = 30 From b54f8af171b80713f115f0d35b454eb83669d4c0 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 25 Oct 2022 09:30:41 +1100 Subject: [PATCH 5/9] Update provider workflow config for Europeana --- openverse_catalog/dags/providers/provider_workflows.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index eceb0a632..f6cb1f42c 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -5,6 +5,7 @@ from providers.provider_api_scripts.brooklyn_museum import BrooklynMuseumDataIngester from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester +from providers.provider_api_scripts.europeana import EuropeanaDataIngester from providers.provider_api_scripts.finnish_museums import FinnishMuseumsDataIngester from providers.provider_api_scripts.freesound import FreesoundDataIngester from providers.provider_api_scripts.inaturalist import INaturalistDataIngester @@ -123,6 +124,7 @@ def __post_init__(self): ), ProviderWorkflow( provider_script="europeana", + ingestion_callable=EuropeanaDataIngester, start_date=datetime(2011, 9, 1), schedule_string="@daily", dated=True, From 1ea1a50763b2058e3e863d30701a479e94c4f316 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 25 Oct 2022 16:05:26 +1100 Subject: [PATCH 6/9] Add back default cursor from previous implementation --- .../dags/providers/provider_api_scripts/europeana.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py index 65617d184..14de1d1fd 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py @@ -132,6 +132,7 @@ def __init__(self, *args, **kwargs): # ``self.date`` attribute for us, so we can construct that # ``query`` prop for the request params ahead of time. 
"query": self._get_timestamp_query_param(self.date), + "cursor": "*", } self.record_builder = EuropeanaRecordBuilder() From a34b7d83df8a2d5949747e9b845d888aed345187 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Tue, 25 Oct 2022 16:10:47 +1100 Subject: [PATCH 7/9] Move success check into `get_batch_data` Co-authored-by: Madison Swain-Bowden --- .../provider_api_scripts/europeana.py | 13 +- .../provider_data_ingester.py | 2 +- qq | 1399 +++++++++++++++++ 3 files changed, 1407 insertions(+), 7 deletions(-) create mode 100644 qq diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py index 14de1d1fd..8dbb19624 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py @@ -156,16 +156,17 @@ def get_next_query_params(self, prev_query_params) -> dict: "cursor": self.cursor, } - def get_should_continue(self, response_json): - if response_json.get("success") != "True": - logger.warning('Request failed with ``success = "False"``') - return False - + def get_should_continue(self, response_json: dict): self.cursor = response_json.get("nextCursor") return self.cursor is not None - def get_batch_data(self, response_json): + def get_batch_data(self, response_json: dict) -> None | list[dict]: + if response_json.get("success") != "True": + logger.warning('Request failed with ``success = "False"``') + # No batch data to process if the request failed. + return None + return response_json.get("items") def get_record_data(self, data: dict) -> dict: diff --git a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index f1db52dcd..14a12d4fa 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -331,7 +331,7 @@ def get_should_continue(self, response_json): return True @abstractmethod - def get_batch_data(self, response_json): + def get_batch_data(self, response_json) -> None | list[dict]: """ Take an API response and return the list of records. 
""" diff --git a/qq b/qq new file mode 100644 index 000000000..12694a072 --- /dev/null +++ b/qq @@ -0,0 +1,1399 @@ +1ea1a507 (HEAD -> refactor/europeana-provider-base-class) Add back default cursor from previous implementation +b54f8af1 (origin/refactor/europeana-provider-base-class) Update provider workflow config for Europeana +384380dc Remove unnecessary batch_limit override +dfac5064 Fix description missing if en or def are empty +2781620c Refactor Europeana to use ProviderDataIngester base class +cc4b2f9f Add default implementation for `get_media_type` for providers with single media type +9ff501e6 (origin/main, origin/HEAD, main) 🔄 Synced file(s) with WordPress/openverse (#802) +63b0fb7f Retire TSV loading workflow (#789) +3217ed5e Made improvements to `CONTRIBUTING.md` (#791) +8f92318c (tag: v1.3.5) Refactor Freesound to use ProviderDataIngester (#746) +46c2c161 Retire Walters Art Museum provider script (#786) +740cf00c Bump pytest-mock from 3.9.0 to 3.10.0 (#781) +db47359a Refactor Jamendo to use the ProviderDataIngester (#741) +d678dc7a Disable email on failure by default (#788) +8ee7fb72 Add concurrency settings for workflow (#770) +fcf1d90c 🔄 Synced file(s) with WordPress/openverse (#787) +4fee8ce9 Increase dependabot PR limit to 10 (#780) +337ea7ae 🔄 Synced file(s) with WordPress/openverse (#771) +4cb9d417 Fix italics for duration disclosure (#769) +f6538ce2 Bump pre-commit from 2.14.0 to 2.20.0 (#779) +d378ba7b Bump tldextract from 3.3.1 to 3.4.0 (#777) +d926e083 Bump apache-airflow[amazon,http,postgres] from 2.4.0 to 2.4.1 (#767) +06faf94f Bump pytest-sugar from 0.9.4 to 0.9.5 (#751) +62ee12a0 Bump isort from 5.9.3 to 5.10.1 (#764) +e49b0c32 Bump black from 22.3.0 to 22.10.0 (#778) +c1b970b1 Add user agent to StockSnap header and use header in requests by default (#765) +d4dbf4d0 Improved data refresh status reporting (#744) +7bf37fc5 Bump pytest-mock from 3.6.1 to 3.9.0 (#749) +58247f67 Bump tldextract from 3.1.0 to 3.3.1 (#752) +600b9eea Remove periods after URLs in log lines. 
(#195) +032e6ce9 Update README.md +008185f2 Merge pull request #190 from WordPress/repo-sync/openverse/default +7fa22e8e 🔄 Synced local '.github/PULL_REQUEST_TEMPLATE.md' with remote '.github/PULL_REQUEST_TEMPLATE.md' +dd8bf859 Merge pull request #187 from WordPress/cleaned-up-docs +a48c70fc Update README.md +a445771e Update README.md +07d3ffb0 Update README.md +71785485 Add missing newline +c5a1be76 Streamline monthly + daily dag lists in README.md +8ca3ba0e Merge pull request #185 from WordPress/repo-sync/openverse/default +a0f304f0 🔄 Synced local '.github/workflows/pr_label_check.yml' with remote '.github/workflows/pr_label_check.yml' +a8061309 Merge pull request #184 from MuhammadFaizanHaidar/patch-1 +fade0eb7 Renamed the source suggestion issue template +ec7e08ad Merge pull request #179 from WordPress/add/recreate-recipe +48c1f0cb Merge pull request #180 from WordPress/repo-sync/openverse/default +18c6a10b 🔄 Synced local '.github/workflows/new_prs.yml' with remote '.github/workflows/new_prs.yml' +909ead95 Allow passing flags to `test` recipe +213904c8 Add recreate recipe +186e4aa1 Merge pull request #174 from WordPress/repo-sync/openverse/default +a39d32a1 🔄 Synced local '.github/workflows/pr_label_check.yml' with remote '.github/workflows/pr_label_check.yml' +ceedd6fe Merge pull request #173 from WordPress/repo-sync/openverse/default +8680573a Merge pull request #172 from WordPress/ack-update +42e64d88 Update README.md +e96f7e3f 🔄 Created local '.github/workflows/pr_label_check.yml' from remote '.github/workflows/pr_label_check.yml' +aeee7989 🔄 Synced local '.github/workflows/new_prs.yml' with remote '.github/workflows/new_prs.yml' +375bd82d 🔄 Synced local '.github/workflows/new_issues.yml' with remote '.github/workflows/new_issues.yml' +053d85ed Update acknowledgements section +8ca48c86 Use dag_factory for Provider API DAG creation (#163) +c5e6d4bf Merge pull request #159 from WordPress/add/formatters +9f8b50f9 Fix requirements.txt comment location +4af80b0a Remove unused flake8 annotations and fix exclude pattern +ac09b2f7 Make env.template not executable +117f84d3 Rename common lint job +a5116c82 Use pre-commit for CI linting +277de7a8 Add black and isort and apply to all files +17628e74 Merge pull request #153 from WordPress/add/just-scripts +c2d63246 Remove old recipe from readme +90461c21 Do not load any .env files for just +fa728f40 Ensure containers are running before running exec +d30dd25b Use more general language for logs recipe behvaior +f7e7847e Complete list of running containers +cfcc19dc Fix justfile to use dev configuration +9cecc481 Remove directions to switch directories from README +bb47e440 Rename makeenv to dotenv +f657e964 Add preliminary just scripts +1ed278ec Merge pull request #157 from WordPress/add/pre-commit +608fa9e2 Merge pull request #151 from WordPress/add/simulated-dag +7898fc5c Merge pull request #156 from WordPress/local_s3_bucket +cc4f2569 Organize requirements files and de-duplicate +54c4afc1 Apply pre-commit to all files +92ad9f0c Add general pre-commit hook +58861903 Add pre-commit +fa3c191c Merge pull request #154 from WordPress/update/switch-to-volumes +dad98228 Add note about volume prune +c35effa7 Update example value for `AIRFLOW_CONN_AWS_DEFAULT` envvar +9e25193a Add openverse-airflow-logs to BUCKET_LIST +25d16127 Use `tries` param instead of `TRIES` constant +8a0e75e5 Update README to remove volumes on cleanup +9e93f527 Switch local postgres to use volumes +7c9158ca Replace os.path with pathlib in provider API script template (#149) 
+28df4bf3 Update Apache Airflow version (#148) +a2d16cef Add manually run healthcheck DAG +04eaeeed Merge pull request #147 from WordPress/fix/provider-template-path +4f9e7341 Log cleanup DAG (#139) +c7310513 Fix resource path string +6c172033 Simplify catalog folder structure (#133) +ea3b2b8f Merge pull request #145 from WordPress/fix/make-harmonious-with-api +697b406a Allow running the catalog and the API at the same time +b12ba815 Merge pull request #114 from WordPress/stocksnap +5bf81223 Update StockSnap tests and example files +bfc9d0d8 Get creator data from StockSnap API +925272e2 Format with black & flake8 +a7d562a3 Make image's `title` from tags/keywords +8dca7f98 Get `foreign_landing_url` from StockSnap API +2d099915 Merge branch 'main' into stocksnap +3410f7e3 Merge pull request #136 from WordPress/airflow-remote-logging-example +7fc4fdbd Merge branch 'airflow-remote-logging-example' of github.com:WordPress/openverse-catalog into airflow-remote-logging-example +0ad6cfee Improve remote logging docs +a3269dd8 Merge branch 'main' into airflow-remote-logging-example +c739c24d Replace `genre` property with `genres` in tests (#137) +3818b201 Update to new values in Airflow 2 (logging namespace moved in airflow config) +5f0b2ca0 Add example vars for airflow remote logging +06fb4991 [API integration] Add Jamendo provider API script (#113) +f54ad987 Merge pull request #135 from WordPress/mv_docs +73e35915 Merge pull request #134 from WordPress/repo-sync/openverse/default +ca8dd1d1 🔄 Synced local '.github/ISSUE_TEMPLATE/' with remote '.github/ISSUE_TEMPLATE/' +4ec2d707 🔄 Created local '.github/workflows/new_prs.yml' from remote '.github/workflows/new_prs.yml' +bc69ec84 🔄 Created local '.github/workflows/new_issues.yml' from remote '.github/workflows/new_issues.yml' +19ed1373 Add handbook link to README file +66df8b18 Delete docs folder +6724b148 Update stocksnap tests and example `full_item.json` +ed77e0e2 Get image title from API response instead of the scraped page +8764118f Merge pull request #131 from WordPress/pr_template +73f50dd7 Add a PR template to the repository +d1dde6fc Merge pull request #130 from WordPress/modify_audio_columns +924b2b1b Add stocksnap tests +bb31eb93 Pass license_info instead of license_ and license_version +182e1592 Rename `alt_audio_files` column to `alt_files` +dd4bcff0 Add funtion to merge arrays in sql +8ac0298b Change `genres` column to ArrayColumn type +8e2186ef Add new `ArrayColumn` type +9895bf2d Modify columns in test_audio.py +6b4fe9ff Rename `standardized__popularity` column in view tables +621765b1 Make field for audio genres plural +3a816554 Add watermarked column to sql files +eaaba615 Merge branch 'main' into stocksnap solving conflicts +20772bb4 Make wikimedia script pass license_info, not license_url (#129) +bf5ebd21 Add a script to create provider API script template (#128) +0f82371f Merge pull request #126 from WordPress/rm_duplicate_providers +546fade1 Delete duplicated CommonCrawl providers +d3db16d7 [Quality] Make provider scripts pass validated license_info to the storage module (#66) +9446c7f9 Add support for other media types to popularity calculations (#112) (#124) +8bac45a0 Add missing `watermarked` column to audio loading table (#125) +5a4c5871 Ingest wikimedia images marked with CC0 and PDM (#119) +073c1215 Clean Wikimedia item titles (#120) +5d1ecb2b Add Audio to the database (#111) +ab8fa907 Refactor to make only one extra request per image +93415d90 Add samples files of an image and a api response for tests +127fa29c Add 
instruction to write tsv file with image data +9519967c Fix filling of tags field +7a14f6a6 Complete image's title, creator and creator_url +b601984e Set default output dir for commoncrawl (#118) +2bf469ec Merge pull request #116 from WordPress/issue_templates +870cbc2c Add the link to the Make site +6dfbfb01 Update labels to the new format +e7aeda8b Add volunteering section to all templates +ccd23f7e Align bug and feature templates with other repos +b1cc1fee Program stocksnap script with minimum required fields +a42476a9 Add StockSnap to `dags/util/loader/provider_details.py` +14452b21 Create base provider files for stocksnap +264306c1 Improve DAG creation template Signed-off-by: Olga Bulat +2c98e242 Extract media type from staged tsv file name for loader (#110) +38b90981 Add AudioStorage entity (#85) +25e18fa7 Extract MediaStorage entity as parent to ImageStore (#83) +ab8d3ccc Merge remote-tracking branch 'origin/template' into template +67b9b303 Merge branch 'main' into template +13e607b8 Fix typo in provider template script +5efdd322 Add ingestion column to MediaStore when using provider API (#72) +5351b5e8 Remove mutable parameters in provider api scripts (#100) +cd159ccd Remove logging of url rewriting when not rewritten (#108) +0c7a5073 Fix pep8 violations (#103) +88a777ce Make the script output clearer +41714d25 Merge branch 'template' of github.com:WordPress/openverse-catalog into template +4a03768e Make image the default media type +5efe43a8 Replace relative path with absolute to fix file not found errors +1331d460 Better wording for script date parameter +85affa35 Merge pull request #104 from WordPress/release_drafter_on_main +3eb439d8 Run release drafter action on push to main branch +01e76a72 Merge pull request #90 from WordPress/local_sql_order +5e456603 Update src/cc_catalog_airflow/templates/template_provider.py_template +7181cd35 Shorten lines +b194efb8 Merge branch 'main' into local_sql_order +b343d84d Merge branch 'main' into template +99a6f0e7 Add more trailing zeros +9942d241 Merge pull request #98 from WordPress/run_ci_on_main_push_only +68172b94 Run CI on push only on main +e6a64c6f Run CI on push only on master +6f605316 Merge pull request #71 from WordPress/improve_url_logging +387c973a Fix linting errors +cb6d2647 Merge pull request #91 from WordPress/fix_dep_version_conflict +012df2db Make URL logging less verbose on success, more verbose on failure +f4d9ebb0 Merge branch 'main' into improve_url_logging +7ca3fea5 Create a Provider API script template +408cb8a4 Fix the dependency version conflict +a22d1931 Ensure Docker loads local_postgres sql scripts in correct order +6199d4b3 Merge pull request #76 from WordPress/add_testing_workflow +fc9a7a34 Merge pull request #86 from WordPress/dependabot/pip/src/cc_catalog_airflow/urllib3-1.26.5 +66b5e693 Add trailing new line to lint workflow +51290e41 Remove workflows from workflow-disabled folder +06e64b91 Merge pull request #81 from WordPress/fix_test_failures +70515f27 Bump urllib3 from 1.25.11 to 1.26.5 in /src/cc_catalog_airflow +d9e77234 Merge pull request #80 from WordPress/codeowners +20940a9f Merge pull request #68 from WordPress/update-readme +c6726a53 Fix failing text, improve import readability +dc7ebe29 Add blank lines for readability +14f0f0bc Merge pull request #78 from WordPress/extract_common_package +0cd97686 Create a CODEOWNERS file +86a67a4f Move the common package to a higher level to simplify testing +a61a6649 Add missing new lines at the end of files +400bd8f0 Re-add the lint and test workflows from 
the original repo +10b31ed7 Merge pull request #75 from WordPress/dependabot/pip/src/cc_catalog_airflow/flask-appbuilder-3.3.0 +9a9550c2 Bump flask-appbuilder from 3.2.3 to 3.3.0 in /src/cc_catalog_airflow +724031fb Merge pull request #70 from WordPress/dependabot/pip/src/cc_catalog_airflow/lxml-4.6.3 +1886fb09 Merge pull request #73 from WordPress/release_drafter +b4780b25 Update release-drafter.yml +3426884a Add configuration and workflow for Release Drafter +900ad221 Log the actual URL requested +94b49799 Bump lxml from 4.4.2 to 4.6.3 in /src/cc_catalog_airflow +a860168d Merge pull request #63 from WordPress/airflow_update +4ab4e6ee Update README.md +c011f4bf Fix imports +46c20851 Fix `test_operator_util` +ecd5e7a0 Replace deprecated provider imports +92fcc81a Update python, airflow, dependency versions +49539a56 Merge pull request #62 from WordPress/readme-updates +68691f6b Update openverse-catalog.md +17dffa56 Remove Openverse Search +66063f30 Add a space +fe1cdabc Update links +461e8f85 CC Catalog to Openverse Catalog +b68aa7ef Replace CC Catalog with Openverse Catalog +3e08e9e6 path fixes +d316be4a More typos and formatting for markdown files +02d831c7 Use WordPress CoC +aefd1e3c Merge pull request #54 from obulat/update_dependencies +ea1f34bd Merge pull request #56 from Automattic/flickr-improve-docs +165e5ae3 Add documetation on generating a Flickr API token +9760ce18 Fix airflow db initialization +68ef4162 Fix production dependencies compatability with upgraded airflow +9012a249 Pin pandas version +84613a9b Update to postgres 13, apache-airflow 1.10.15 +f2826460 Update README.md +e0acf60a Merge pull request #1 from Automattic/cc-a8c-migration +c9729f88 Switch to renamed workflow dir to disable actions +efb34327 Comment out and disable GitHub actions +25f226ab Remove CC meta files +cd789c26 Update README with migration notice +67b4a8c7 Merge pull request #544 from creativecommons/discontinued-notice +eb71f203 Update README.md +b9955ff3 Update README.md +98348de6 Update README.md with discontinued status +0ab8a380 Merge pull request #539 from creativecommons/ct_codeowners_1608054710 +68b7aa47 Sync Community Team to CODEOWNERS +b1bf826f Merge pull request #537 from ariessa/master +54efa946 Merge pull request #536 from dravadhis/iss463_mockimgstore +70bae81d Removed links +5391467d Create MockImageStore class for testing +709792a9 Merge remote-tracking branch 'upstream/master' +0609aef9 Fixed broken links +c045c7ff Merge pull request #535 from tushar912/fm-airflowdag +488ce9c9 Updated flickr.md +35aa2a94 Improved doc and its formatting +f0915891 Fixed broken links and formatting +ea9f45ea change start date +0a04a9f4 add tests for finnish_museums_workflow +04ef1ace add finnish_museums_workflow +abbe9eb8 Merge pull request #532 from tushar912/finnish-museums +e928abde process object list page by page +a64dd2b1 made total_images global +b1e9f311 fix line too long +d1ca7ced change provider details +8b528bd0 handle image_rights none +a092756d fix raw_tags to be array of str +7efd39e3 Merge pull request #533 from creativecommons/use_execution_date_for_commoncrawl +7eed044b Add files via upload +2da4f374 Create cc-catalog.md +6667b81a remove unused imports +e534f04f format local s3 init script with black +45b193db use execution date to calculate cc_index, rather than pulling it from S3 +b5f0a720 add additional cond for none +67c2c7fe remove unused import +29e05195 format test ac to pep8 +103653d8 fix line too long +a14f3ba3 format ac to pep8 +971f9c79 finnish museums provider and tests 
+3347889e Merge pull request #530 from creativecommons/common_crawl_etl_airflow_dag +6672f742 add new variables to environment template +50d386f1 fix linting error +daca6cec add test for new CommonCrawl ETL DAG +7651e2d5 add tests for operator functions +3884bc53 make s3 file loading function private +ac9c15cd format python files with black +17a10c09 remove unused import +23602f59 extract operator definitions to separate file +2eb201b1 increase number of core instances for run with real data +f7de1c11 add new Airflow DAG to run first pipeline +dc90a220 add airflowignore so that python files don't set off warnings +f8af3f32 Merge pull request #523 from creativecommons/cleaner_workflow_parallelism +9bdb9068 Merge pull request #524 from creativecommons/dependabot/pip/src/cc_catalog_airflow/cryptography-3.2 +8ef22b4f Bump cryptography from 3.1.1 to 3.2 in /src/cc_catalog_airflow +c2012f59 lower logging level for urls import +6a46f51f fix numerous bugs, turn down ImageStore logging +627efb4e add logic to handle defective dictionary when getting license_url +26782163 fix missing fields bugs, add tests for them +0bedecff add functionality to output defective identifiers to file +4c83734b fix bugs, lower DAG parallelism to avoid locking up scheduler +727e3557 (tag: v0.9.0) Merge pull request #517 from creativecommons/clean_preexisting_data_with_disk_write +ca3cdecb fix flake8 errors introduced by black +cd042bd0 remove unused MagicMock import +b56105a2 fix broken test to avoid enironment assumptions +62246613 reformat code with black +b428c200 add test for new cleaner DAG +cc21272c add DAG to run new cleaner logic +2a968ffe add remaining tests for pg_cleaning functionality +9cb650b2 refactor ImageStoreDict for better testability +4ec6f49a improve error handling in pg_cleaner +488f4e4c refactor for testability, add tests to pg_cleaner +664d6599 add test for pg_cleaner +d1a93c98 use non-deprecated logger.warning instead of logger.warn +19546ee2 add row-cleaning logic that saves to disk +61fac227 fix loader_workflow test with new number of DAGs +42b49f46 add DAG to run new overwriting logic +05e27379 fix updating logic so that it handles missing matches +3d548d79 add function to overwrite instead of upsert into image table +cabfa11c Merge pull request #516 from tushar912/ascii-false +348054e9 added test to save unicode string +51bb32fb Shorten line acc to pep8 +a9fe50a0 ensure ascii false in json.dumps +cb81077c Merge pull request #512 from dravadhis/iss366_requirements +d951bdbc Split dependencies into prod and dev +f4bfcb56 Split dependencies into prod and dev Add requirements_dev.txt and requirements_prod.txt. Modify Dockerfile, docker-compose.yml and docker-compose.override.yml to work with new requirement files. +e392b49d Merge pull request #511 from dravadhis/iss211_rawpixel +1cc98a4b Merge pull request #508 from dravadhis/iss507_DAGwalters +f5b015af Make `meta_data.Description` field in RawPixel.py +0739afc1 Make `meta_data.Description` field in RawPixel.py Make `meta_data.Description` field using `pinterest_description` in `raw_pixel.py`. +e97ae8d2 Merge pull request #506 from dravadhis/iss274walter +ce88ff0f Integrate Walters Art Museum API with CC Search Remove 'Paper & Paper-Mache' from list of classifications. 
+bdb9c0da Merge pull request #510 from creativecommons/clean_common_crawl_using_imagestore +a3fe3bd0 Integrate Walters Art Museum API with CC Search +23f6af84 Integrate Walters Art Museum API with CC Search +6d9893f7 Integrate Walters Art Museum API with CC Search Implement page increment logic in walters_art_museum.py +0495d4c0 extend syncer DAG to use new cleaning function +63d2c0ed add directory looping logic to tsv_cleaner, +9923e376 fix path joining to give trailing slash +90dd33d3 format file with black +49511d89 add test for new functionality +66c7e415 Merge branch 'master' into clean_common_crawl_using_imagestore +e824407b Integrate Walters Art Museum API with CC Search Modify _get_image_list to work with _get_response_json method of the DelayedRequester class. Replace get functions with bare dict.get calls. +e57ba18a Merge pull request #504 from avats-dev/fix-readme-broken-links +4dd2695b Add Walters Art Museum Workflow +75bfa891 Integrate Walters Art Museum API with CC Search Set API KEY in env.template as not_set +1190300f Integrate Walters Art Museum API +a0294269 Integrate Walters Art Museum API with CC Search +dd572712 Fix broken links +cd7ae0d1 Merge pull request #1 from creativecommons/master +75b37f8f Merge pull request #499 from dravadhis/issue_fstring +99a84eeb Update europeana.py and wikimedia_commons.py +8fdabad4 Change style to comply with style of the repository +14c5e555 Update test_brooklyn_museum.py +a414ec3d Restore untested files to original state. +c7a59f3f pass environment variables in through bash operator for modification +f48fc1ce move tsv cleaning logic to correct location +752deb26 move commit step so it only happens once per ImageStore +8a35a166 Replace all occurences of str.format() with f-strings +d0426de5 Replace all occurences of str.format() with f-strings +61c9b7e1 Replace all occurences of str.format() with f-strings +36f2c774 Replace all occurences of str.format() with f-strings +f69827f5 Merge pull request #498 from dhruvkb/patch-1 +5bfe7323 Replace all occurences of str.format() with f-strings, keeping the formatting same throughout the codebase. 
+3c032153 Replace ORG_GITHUB_TOKEN with ADMIN_GITHUB_TOKEN +e7d14d6f undo a string sanitization to make sure cleaning is idempotent +caf9e619 add script using ImageStore class to clean a TSV of image rows +deb1a145 make Image row namedtuple public +836aa60a Merge pull request #464 from creativecommons/verify_urls_in_imagestore +a914114d fix flake8 error +a776acdd handle reverse 2.1 license mapping and save raw license_urls +cd8828c7 Merge branch 'master' into verify_urls_in_imagestore +c4ac873c (tag: v0.8.0) Merge pull request #483 from creativecommons/image_expiration +de659eb0 Merge pull request #478 from creativecommons/common_crawl_tags_merge +6cc2ae6b Merge pull request #488 from avats-dev/cat_img_wikimedia +31d5a590 Merge pull request #491 from creativecommons/kgodey-patch-1 +5e0e7135 Rename new-source-suggestion to new-source-suggestion.md +7a1f0317 Merge pull request #490 from akshgpt7/automate-linting +6b06b72c Add pull_request event to lint.yml +67bd9ce3 edit test to check for meta_data.categories +994a8542 List comprehension for dag workflow +335f6258 api table given as argument +509daa8d Merge pull request #485 from creativecommons/europeana_reingestion_timeout_fix +434d45b1 add categories to metadata dict and extract them +b8bde687 Merge pull request #487 from avats-dev/minor-readme-typo +f6e0a206 Merge pull request #475 from akshgpt7/automate-linting +75775fc6 Removed a minor typo +a78c98f4 createad scripts folder +31b872b5 Update image expiration workflow to execute in parallel +2c0d87ee Merge branch 'master' of https://github.com/creativecommons/cccatalog into image_expiration +008890cb increased timeout to 12 +ae0059d1 Add .flake8 for configuration on test files +75ea7834 Remove pull_request event +5d073aef Add test files linting workflow +bb628229 Merge branch 'master' of https://github.com/creativecommons/cccatalog into automate-linting +ec8a1d25 Merge pull request #476 from creativecommons/smithsonian_discrepancy_fix +4238c2fc Merge pull request #474 from creativecommons/si_nmnh_improvements +984bde4c Test image expiration +16a1438a Merge branch 'master' of https://github.com/creativecommons/cccatalog into image_expiration +8769820f Merge pull request #479 from creativecommons/annatuma-source-issue-template +53e80b2c Create new-source-suggestion +dd06686c (tag: v0.7.0) Merge pull request #477 from creativecommons/popularity_calculations_sql +2a8342f8 query to update added +93ba67f8 Merge branch 'master' into verify_urls_in_imagestore +96562359 Update image expiration workflow to run sequentially +b3fa922f add tests for function to calculate standardized popularity +15edb59d add logic to avoid zero, but record raw value +9965948a Initial implementation of the image expiration workflow +b4b2d0ac Fix error in the OLDEST_PER_PROVIDER dictionary +8927ba6a Initial implementation of the expiration logic +b451473b Get all provider names into one location +a7276a05 add tests for SQL module; reformat with black +a434fcfe reformat operators module with black +9b97545f added science and met museum logic (testing) +58e64182 add tests for popularity SQL DAGs +7d5f45f8 add Airflow DAGs to create and refresh image popularity data +a1f56eaf fix SQL bugs, add index so constants view can be updated concurrently +5b721667 Expand the creator and description types considered in Smithsonain +80bec5e2 add operators to allow Airflow to use new view updating functions +a8c3189a add and reorganize SQL-via-python to burn down popularity data +c7b4b97b add new SQL-via-python functions to build and 
refresh popularity data +9e688d95 minor changes (testing) +cdf089c9 merge CC tags script (testing) +b7f6f293 Merge branch 'master' of https://github.com/creativecommons/cccatalog into si_nmnh_improvements +ccd228ca Variable name update +d45848cf Merge pull request #465 from creativecommons/smithsonian_unit_code_check +8c2cb4d0 Merge branch 'master' of https://github.com/creativecommons/cccatalog into smithsonian_unit_code_check +cfa8246e Raise exception when unit code table needs to be checked and update code for consistency +77bfcebd Merge pull request #473 from creativecommons/europeana_reingestion +f89b15a8 Merge pull request #355 from kss682/issue-348 +969bee47 Concatenate creators with semicolons and "and" at the end +7eeb5429 fix linting workflow syntax +6317bb4d Fix linting workflow +8926d817 Merge branch 'master' of https://github.com/creativecommons/cccatalog into si_nmnh_improvements +ee020e42 Merge branch 'master' of https://github.com/creativecommons/cccatalog into smithsonian_unit_code_check +6a7b5870 (tag: v0.6.0) Merge pull request #462 from creativecommons/nypl_implementation +4d33aacc Merge pull request #455 from creativecommons/smithsonian_sub_providers +b12f2626 Improve creator and description metadata in Smithsonian +1893349c dag script +48032026 code and test suite refactored +a3da7a73 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue-348 +cef091ee bug fix. +497e0301 ingestion workflow +9fafd45a code refactored to make it more readable and pythonic +0a99cb75 api key set to NYPL_API_KEY +c379320b implement new SQL-based popularity calculation +401d5997 add SQL files to create tables and views on upstream DB +9d5b18a7 set up local postgres with new tables and views +b70799e4 Check for outdates unit codes as well as new additions +80692c9c Merge branch 'master' of https://github.com/creativecommons/cccatalog into smithsonian_unit_code_check +e6e838c5 single creator name retrieved from API +dacb48d2 Merge pull request #467 from creativecommons/museum_victoria +7439b88f Provider name bug fixed +0294a269 Merge branch 'master' of https://github.com/creativecommons/cccatalog into smithsonian_unit_code_check +e022048e Update the unit code workflow +e3a8439a Merge pull request #461 from creativecommons/flickr_new_subproviders +d6cbb9b2 Change Smithsonian unit code check to store values in table +94f2840c Initial implementation of Smithsonian new unit code check +7b244eea disable all calls to socket.socket, ensure tests pass +ab65fc0f Change logging levels and messages to emphasize actual issues +7e5435e2 add tests for upgrading/verifying schemes of IP addresses +ce2c4f8d add test for url scheme adding logic, fix bug in same logic +2516267a add documentation to license methods +a80e0336 added logger info , removed extra print statement +3ea22ec4 Merge branch 'master' of https://github.com/creativecommons/cccatalog into nypl_implementation +8fce51ad dag script +67050758 removed old NYPL script +5cab507f remove unused exception type +abc533d0 Add WOCinTech as a sub provider of Flickr +abc03825 Merge branch 'master' of https://github.com/creativecommons/cccatalog into smithsonian_sub_providers +57df7e0c stronger conditions in metadata method +27531216 added metadata and remaining test_suite +81ef9741 (tag: v0.5.0) Merge pull request #447 from creativecommons/museum_victoria +67b670a6 unused import removed +d8584b5b Merge branch 'master' of https://github.com/creativecommons/cccatalog into museum_victoria +11977cc6 Merge branch 'master' of 
https://github.com/creativecommons/cccatalog into nypl_implementation +1380f7ed tested image , title and creator methods +9ddc9855 title and creator methods added +2ed12b5f Merge pull request #442 from creativecommons/europeana_sub_providers +6e2e2b6f add logic to recover license URLs from pairs; cleanup/refactor +a0cab049 use f-strings consistently for logging in licenses module +5d6f895b refactor licenses module using new license_path_map data structure +1f4fd885 add logic to create map for recovering license paths from license pairs +116db6c3 move Japanese 2.0 licenses to recoverable lists +295fe295 clean up unused LICENSE_PATH_MAP constant +6ad2b0cc set up path map constants for reversal +355dfbc1 add a number of known license paths to constants file +7cbb9c61 fix broken constants tests +46f718d5 (tag: v0.4.2) Merge pull request #453 from creativecommons/wmc_mediatype_bugfix +fb65a179 Merge pull request #452 from creativecommons/wmc_limit_bug +41c4e457 update license tests, split path correctly in constants +5c728576 add more constraint to license path definitions +2e6f6fd6 add logic to check response code to URL rewriter +55ca7718 tidy up docstring +c78a5525 test and document CC URL validation logic +88914f30 Add workflow for updating Smithsonian sub-providers +9e9ba965 Add test case for checking Smithsonian sub-provider retrieval at DB level +00e02de7 reduce line length for PEP8 compliance +2fac6e9f add docstrings to public URL methods +32afddd7 rearrange methods in licenses for clearer logical flow +235e5d59 give redirection handling logic meaningful name +f56fc508 update image tests to avoid trying to use socket.socket +6c748751 test suite till request handler +2999ad18 image url retrieval method with new logic +1341dd03 Initial implementation of Smithsonian sub-provider retrieval at DB level +17a18090 make scheme-adding function private; reorder urls module +52033994 use urls.rewrite_url_string in licenses module +83c08809 Add test for sub-provider retrieval from Smithsonian at API level +f9b8acfc move logging init to let importer set level easily +d269685f add check to determine if object is an image mediatype +989e7f52 update script to use ImageStore.total_images property +2107bb62 image_id points to new id field in api +04316eb8 turn up parallelism to 8 +62f710dc Initial implementation of sub provider retrieval from Smithsonian at API level +cffa5b40 Drop the temporary table after sub-provider update +40e05abd turn down LIMIT, and turn up parallelism +ba1a84a0 add support for 'URLs' that are IP Addresses +03ff5086 split common.storage.util into smaller pieces +bc420729 remove unused import +7cef6344 add logic to check license URLs for correctness, refactor utils +1d4502e6 changing image id to single number +db532afd Merge branch 'master' of https://github.com/creativecommons/cccatalog into museum_victoria +bca70817 update tests to avoid tldextract calling internet +a3ebd2f8 add URL validation and scheme upgrading logic +2ed77c65 Apply consistent temporary table structure for Flickr and Europeana sub-provider update +eaf76e48 Improve Europeana sub-provider retrieval logic to reduce memory consumption +7be2fe58 Merge branch 'master' of https://github.com/creativecommons/cccatalog into europeana_sub_providers +0e6faea7 (tag: 0.4.1) Merge pull request #446 from akshgpt7/total_images +713f871c Merge pull request #444 from creativecommons/met_museum_bugfix +1b38f727 tidy up quotes and spaces +0c8f44b6 Merge pull request #448 from creativecommons/index_fix +3b23ca72 Add an index to 
temporary popularity table identifier +a5835beb dag for museum victoria +bdb536cc Merge branch 'master' of https://github.com/creativecommons/cccatalog into museum_victoria +3bb24113 pep8 styling +625c6707 test suite +1d2fe409 image id changed to pair numbers +bee74694 PEP-8 fixes +90761309 add total_images property +f47a0581 Merge pull request #440 from creativecommons/kgodey-patch-1 +a8b62bc5 use image name from URL for foreign_id instead of generated index +6672be4a Add workflow for europeana sub-provider update +afe2c293 Throw exception if more than one sub-provider encountered +b7d25c32 Added collaborators to CODEOWNERS now that we have one! +9f8d035f Add test case for europeana sub-provider update +96badcd2 Merge branch 'master' of https://github.com/creativecommons/cccatalog into europeana_sub_providers +eba1a115 tested get_batch objects +6f3aec15 implementation of provider +e25369ac Merge branch 'master' of https://github.com/creativecommons/cccatalog into museum_victoria +2135a0cf (tag: v0.4.0) Merge pull request #439 from creativecommons/output_dir_bug +10bb83fc Merge branch 'master' of https://github.com/creativecommons/cccatalog into europeana_sub_providers +7eb308b0 Merge pull request #420 from creativecommons/retrieve_subprovider +8b86f1c5 add logging statement to see how many rows we're updating +41c4e8f3 configure DAG for manual triggering +2645f6bf modify output path of popularity_workflow +8097c36d Clean the Flickr sub-provider update code +e350f4a3 bug fix : escape character +3741d15c Initial implementation of europeana sub provider retrieval +dfdf125c Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +4d599de2 Add test cases for checking alternative sub-provider update methods +86f6db87 Add changes to the alternative sub-provider update methods +c1331035 Pass provider/ sub-provider information as parameters +a71ccc83 Merge pull request #428 from creativecommons/smk_provider +01c8a72f Merge pull request #427 from jhutchings1/codeql +86aa2fee Merge pull request #434 from creativecommons/swap_tablenames +e726aa15 rename local postgres building SQL files +ea5ad96c Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +640c22aa Alternative methods of sub-provider retrieval +b76403e7 change table names in code that uses SQL +cf26c524 Merge pull request #432 from creativecommons/s3_creds +9f4aa673 Update sub-provider test to match the new image table schema +4e8d4051 testing the api contents +baefadca Read non-standard environment variables into boto3 client +35b95d3e Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +eb2dbda0 Set spacex as separate sub provider and remove redundant source value setting +dabe1722 dag for statens museum +e1e54305 Merge branch 'master' of https://github.com/creativecommons/cccatalog into smk_provider +8ee9909c (tag: v0.3.0) Merge pull request #426 from creativecommons/popularity_calc +fb990034 Merge pull request #429 from creativecommons/deduplication +e35f6bfe remove unused SQL files +af93046e smk implementation and test suite +6306448c progress - +2bfb4c9f Consistent usage of single and double quotes +c8dd0bdf add newline +52fbc639 Merge branch 'popularity_calc' of github.com:creativecommons/cccatalog into popularity_calc +85251197 Don't use the source as a factor in the calculation of each metric +9ba46f1b Update src/cc_catalog_airflow/dags/util/popularity/math.py +b6c809c6 Update 
src/cc_catalog_airflow/dags/util/popularity/math.py +6bb8f3b1 Merge pull request #418 from creativecommons/science_museum-bug-fix +6a398b67 update paths operator to remove all files from staging directory +dd9c924d change sql operators to use new_image table during transition +e0230593 add new_image table to local testing setup +1ccf8e06 use new table schema in loader and SQL modules. +a4068bed Add CodeQL security scanning +5351d24c Better name for a test +5d840e57 fix naming of column, reshuffle columns for commoncrawl TSVs +7ddcceaf add utility functions to migrate TSVs to new form +6ee1aa30 add SQL file for different strategy to avoid updating image +f273a730 Missing paren in docstring +52554534 Decode S3 cache properly +f91a36ff Write the percentiles cache to s3 +add82b65 Define main before setting up DAG +a18163be Add DAG for popularity workflow +cf0c0b4b Document popularity score calculation +07081467 Refactor percentile calculation and test cache validation +aa29b18e Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +137d0898 Changes to make sub provider information available from a common file +dbc13bb9 add looping behavior to migration SQL +cb8aa9c3 add duplication column before putting data into it. +28df08e9 Initial implementation of DB update for sub providers related to Flickr +250aa24b tidy up SQL formatting for legibility; remove extraneous columns +ad117bb3 Add test for pullingresults from psql +2f424d05 Start testing popularity workflow +3a0654ff Log progress of popularity calculation +a2835e73 Use more appropriate RuntimeError exception instead of SystemError +c8d9aeaa Modularize popularity job +2020cec1 add SQL files to implement the deduplication process +58d8590d Upload normalized popularity scores back to the metadata column +6b8ea4c3 repair error in new test table definition SQL +59d87b98 Merge pull request #425 from creativecommons/kgodey-patch-1 +37f172fa update local testing image table with new uniqueness constraints +866ce6c0 Added Catalog core committers to codeowners. +7d19f33e modify sql operators to use new uniqueness constraint +a9698d8c add test data for common changes in URL we'd like to detect +7526e573 Fix some issues preventing popularity cache from being computed; make sure output tsv is open before calling copy_expert +eb4c4204 Remove popularity logic from sql.py; that's specifically for the loader workflow and doens't belong there. Implement recomputation of percentiles from expired file cache. 
+e87818a3 Update sub provider retrieval logic by setting the provider value in source +f6795184 foreign id reference image uid +d3e5ee15 Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +7059538f Merge pull request #368 from allen505/europeana +2b35e3a5 Add workload for producing normalized popularity TSV and queries for generating the popularity dump +69738bf3 Fix error in test case with setting source +923a67b8 Update sub-provider retrieval test case +30eb7cb1 Update sub-provider retrieval to consider user ID +63186bae Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +dbca4d85 Merge pull request #410 from creativecommons/wmc_empty_response_bug +47310ad5 Merge branch 'master' into europeana +1bc9e199 Added european key to env.template Changes to be committed: modified: env.template +1a757ab4 Code refactoring as per Code review +782ee086 Merge pull request #411 from creativecommons/science_museum_workflow +c820b243 science museum workflow +bbdb8677 improve logging when the image_batch has no pages +088da3cc Merge branch 'science_museum_workflow' of https://github.com/creativecommons/cccatalog into science_museum_workflow +0e511e41 science museum workflow +1fe6f583 update _get_image_pages to handle non-empty response with no pages +f237aba6 removed comment unrelated to science museum +2f9d72fb science museum workflow +5c743502 Merge pull request #407 from creativecommons/env_template_bugfix +b6d29a6f Change schedule_interval to daily +10a17de7 Removed return from pagewise and refactored code Changes were made as per Code Review Minor changes to fit PEP8 Changes to be committed: modified: dags/provider_api_scripts/europeana.py +2adccac3 Merge pull request #354 from ChariniNana/master +fd90b54c Merge pull request #400 from creativecommons/science_museum +12dcae0c Refactored code as per Code review +708431a6 add LOADER_FILE_AGE and DATA_GOV_API_KEY to env.template +52eb5dad Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +9f778a96 (tag: v0.2.0) Merge pull request #404 from creativecommons/loader_file_age_bugfix +c9ec3aed change loader_workflow file waiting time to 15 minutes +52952569 Merge pull request #402 from creativecommons/wikimedia_reingestion +8421ac92 Merge pull request #401 from creativecommons/smithsonian_integration +f2ef7a1a improve logging formatting strings as per Timid Robot's comment +19bcc3d6 paritioned using custom year range. 
+d5ba7b8a change docstring to reference correct module +fd098163 Merge branch 'master' of https://github.com/creativecommons/cccatalog into science_museum +cb193402 change DAG name to align with Flickr ingestion DAG +12441607 Add source as Flickr when the provider is a sub-provider +67ed6353 Merge branch 'master' of https://github.com/creativecommons/cccatalog into retrieve_subprovider +9f0a9ffb add workflow DAG to run smithsonian ingestion weekly +e77dea5a fix bug in dag factory so that it uses correct operator +1d9e7bb7 Merge branch 'master' into smithsonian_integration +2f7c36d9 Merge branch 'master' into smithsonian_integration +197064cf add explicit tests for _check_type function +a574682f add docstrings for main functions, improve logging of type checker +fc91f163 change log statement level to reduce output +52eb85f6 add tests for remaining functions +ac08f537 modify _extract_tags so that it always returns a list +6a489621 Merge pull request #394 from creativecommons/flickr_reingestion +ab0b3e16 fix problems found in testing +b5552486 refactor to avoid single-use variables, add tests +4151e473 add basic tests for processing response_json +7485e7d2 use type checker function for row getter +7a369870 improve logging calls, remove unnecessary f-strings +0640d0a5 add rudimentary type verification to handle unexpected JSON values +9f5fbe34 license method and other utilities tested +2c394c29 Add test for sub provider retrieval +43010cb1 Remove independent image store creation for default provider +98e5a6c8 Apply suggested changes in error string parsing +d3f04aa0 Merge remote-tracking branch 'upstream/master' +0e5fdc66 Merge branch 'master' of https://github.com/creativecommons/cccatalog into science_museum +c0b294fa image methods created and tested +787cdfee improve creator-finding logic, add many creator type options +6f410216 Initial implementation of sub provider retrieval +48f06049 Merge pull request #398 from sp35/patch-1 +6ae9a8f3 _get_batch_object and param method tested +21817630 add large sample JSONs from SI to .gitignore +1c992ab9 remove large sample data JSON files +1749ba2d add tests for smithsonian.py; add sample responses +3aaa1742 change to hash partitioning to control response size +b1db1ef3 Add slack channel for the repo in metadata +10788071 changes in _get_object_json and _get_license_url +416a8ef4 Set the max allowed defective rows to 10 +9fcc0494 Merge remote-tracking branch 'upstream/master' +80be33fc Skip defective rows only at local loading excluding the logic from S3 loading +48ed4b79 Trimmed a line europeana.py to fit 79 chars Changes to be committed: modified: dags/provider_api_scripts/europeana.py +d70a10fa Removed unnecessary conditions as per code review Removed empty license condtions Trimmed lines to 79 chars per line Removed import of re +6dfc6bf9 Suggestions from code review +fd0d7e80 add workflow implementing scheduled WMC reingestion +c9dc9cbc Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue-348 +006ecb1e Merge pull request #359 from kss682/issue358 +c490ed95 add sleep to test workflow +1222e012 Update push_pull_request_test.yml +5e50e50a tune ingestion strategy configuration to prefer newer data +7764ff80 add tests for new functionality +641bfb83 add tests for get_dated_main_runner to check day-shifting logic +02b9a8cd update operator getter methods to use f-strings +734db84d change default start_date to a datetime.datetime type +4f380cd1 Remove unused import +70847b93 Attempt skipping defective rows in s3 load +510fc725 
clean up unused imports in test_operator_util.py +9be5e788 document ingestion workflow and reingestion day list calculator +d7374670 remove subdag operator usage for simplicity, +f3fe8e00 rename wait operator getter to conform with others +8e64af58 extract meta-DAG factory method, +024a477d add reingestion meta-DAG +fa8f7eed refactor Flickr workflow in preparation for meta-DAG +b5aaafc8 Merge pull request #390 from creativecommons/merging_strategies +6228731b add newlines at end of TSVs +c5285a31 add truncated flickr example TSV files +b720f32a add merge_jsonb_objects function, reorganize/refactor +723aa8bf lxml used to get license and other minor fixes +68e22614 add newest_non_null and merge_jsonb_objects strategies +ade00f44 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue-348 +3a4ab37c Merge pull request #389 from creativecommons/refactor_sql_module +64c2fdf2 Use triple-quoted f-strings +c437c36f use string constants in extracted function arguments +39b2f705 rename extracted function to match its current behavior, +08f07d30 factor out string constants, +22842a72 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue-348 +c3501add Merge pull request #374 from creativecommons/load_s3_to_postgres_workflow +ec6d0e23 Merge pull request #380 from amartya-dev/master +ab339292 Merge pull request #381 from kss682/issue371 +80fcac41 reverts formatting changes +c4148e37 check for name in enriched tags formatting changes modified test to include case for tag being dict +a60e7605 add fields to gather and looping through unit codes to script. +ff7b5792 edited image.py ti comply with pep8 +1383744a calls rewritten raw_pixel and removed RawPixel +2e2123b2 check tags against a blacklist in ImageStore +ee1e61b9 enable pulling from Smithsonian API endpoint. 
+e2afe96d Merge branch 'master' into smithsonian_integration +1bfd622c set up local S3 -using tests to run with --disable-socket +be39ebe4 update env.template with new environment variables +b8209e9e add final (for now) tests for s3 functions +624055b1 Merge pull request #321 from sp35/rawpixel +a674567f Remove unused import +a51fc569 incorporate function to load data from S3 into Postgres into DAG +d6539025 Merge pull request #369 from creativecommons/sweep_to_s3_workflow +7644cfac add function to load data from s3 into postgres +5e7d1363 refactor loader sql module in preparation for adding s3 loading +c1b7d286 reorganize DAG to hold new s3 loading to postgres logic +0cedb6ac Added DAG and corresponding test file New file to create DAG to execute Europeana's script Test suite to check for any import errors and the number of DAGs created Changes to be committed: new file: dags/europeana_workflow.py new file: dags/test_europeana_workflow.py +e50270c4 set up local postgres with mock aws_s3 functionality +935e2e2c change f-string to plain string, as per Timid Robot's comment +71c8e8ec freeze more requirements to let building happen without errors +25d4192a add error exit if local S3 isn't working +68e4be2e add basic tests for s3 copying method +a7a0d670 Refactor and test function for metadata functions Test functions for metadata and description functions Code Reafactoring New test for get_image_list for last page +bd4d0130 LangAware Description, tests to extract_data Description of images is taken in the following priority: -English -Default -dcDesption if neither of the above were available +e92fc0d1 [fix] Use logger for all logs and tags directly +2b586e77 set up local s3 for local running and testing +2c8bf9d6 add loading data to s3 as a dependency of local loading +44e79188 modify s3 loader trigger rule to avoid race with local loader +62f80b8f add s3 loading logic to database loader workflow +15b5811d date changes and removed old config +8759b1f9 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue358 +b2f57ef1 patch object used to handle response +ab6e5d80 handling objects as batch and its tests +cab3afcd [Fix] Change test case for list of tags in test_raw_pixel.py +df0efb38 [fix] Faulty list of tags and logger instead of logging +f1a480d0 Added new test functions and response.json file Test function for empty list and error in response. Minor bugs fixes. This commit also adds a sample success response to be used for testing Changes to be committed: modified: dags/provider_api_scripts/europeana.py modified: dags/provider_api_scripts/test_europeana.py new file: dags/provider_api_scripts/tests/resources/europeana/europeana_example.json +901d66d3 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue-348 +c3e347df Merge pull request #357 from akshgpt7/automate-linting +838132d4 Merge pull request #361 from creativecommons/freeze_requirements +0bbe4b76 Freeze SQLAlchemy version due to upstream bug +3c73a0d5 Remove branch restriction from push +5c18e6d1 metropolitan museum workflow +ef41f845 Merge pull request #278 from AyanChoudhary/rewrite_met_museum +8d48efbf Add workflow for linting, annotations for pull requests and push +e44acf2d Fixed timestamp and cusor bug Changed the Timestamp to ISO 8601 format Added code that caused error with last page of the results. 
+f9a83211 Added test file for Europeana API +7c4a1073 fix:styling issues +a3e41797 use getenv to get API key +9ea85543 proper use of image count +2bab62cf Refactor raw_pixel and test_raw_pixel - improvements +eee72263 Added code to get number of images stored modified: src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py +90cdc32e Stored image & metadata using the ImageStore class +3520b9eb brooklyn museum rewritten +2e91a188 Merge remote-tracking branch 'upstream/master' +8f6e5dce Test skipping of defective rows upto a maximum number and throw error if max exceeded +cc9c4568 Update the data import from tsv to table to support skipping upto a maximum number of defective rows +1262a97c Merge pull request #344 from amartya-dev/automated_testing +50e3a6eb Merge pull request #349 from creativecommons/prod_deployment +670ab637 reorder commands in deployment bash script +4f13369b Tidy up bash script; improve Dockerfile directory handling +2f1d58d3 fix: patch test API calls with monkeypatch +ab24b4c2 Pagewise function implemented Images are retrieved pagewise till all images are retrieved +97fb972d Merge branch 'master' into prod_deployment +5fbad057 add deployment bash script to avoid remembering commands +45ade1a0 Merge pull request #330 from akshgpt7/phylopic +6abb65ce Merge branch 'master' into prod_deployment +2c7f7ce9 Merge pull request #346 from kss682/issue241 +955caef0 BashOperator used +1a7bd4d7 reconfigure wmc workflow file to match production +77d4f6dc fix bug where flickr script fails when reponse has no images +3591ff69 add testing plugins to requirements.txt, +60da0c71 test on both push and pull request +b863af74 Merge branch 'master' into prod_deployment +9656653c Add test resource for test_raw_pixel.py +b6c503a8 Add tests for raw_pixel.py +503bce53 monthly workflow indiviual scripts +0cf48025 Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue241 +64696eee Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +f972a59d Remove old Phylopic workflow +9f97e582 Merge pull request #328 from kss682/issue255 +5f49b694 Merge pull request #347 from creativecommons/flickr_bugfix +dcb8c33f add tests, clean up temporary bug fix +55f721d7 Merge branch 'master' into flickr_bugfix +be0da184 Added code to fetch details from the first page using Cursor based pagination. 
+84980d31 Monthly workflow and testsuit added +5908f5b9 monthly workflow +c6a4b64f Added command to copy env template +e7244c12 Added command to copy env template +389dde15 Changed directory before docker compose +d3fba27c Corrected the working directory specification +3fc9c045 modified commit accordinf to latest docs +3842c449 modified commit accordinf to latest docs +c6446844 Automated testing on pull request +23dabab3 Merge branch 'master' into prod_deployment +14419b32 change environment and docker compose configuration for prod use +fb9994f9 Merge pull request #342 from kss682/issue336 +79e20bad Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +58fe6d5f changes made as per review +9cf6251a Merge branch 'master' of https://github.com/creativecommons/cccatalog into issue255 +1d25cef6 method filter moved to query parameter +5d656238 Merge pull request #341 from SaurabhAgarwala/pr-moving-workflow +73836ecf Add the continue-on-error configuration to the PR moving workflow +0f250a13 Merge pull request #331 from creativecommons/dag_specific_loader +be04e432 feat: added tests for additional images +4a0ee314 Merge pull request #320 from mjprince/master +f51d7b4b Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +2b5f16dd Merge pull request #314 from ChariniNana/master +9bd3c2c1 test suite for cleveland script. +82ba4d40 get_response retry logic changed +a657f9d7 Merge pull request #332 from akmadian/master +956c19c7 Write test for phylopic workflow +3a7c507b Swap "not ready for work" and "awaiting triage" in issue templates +26f6c45d add python script to wait until Airflow metadata DB becomes ready +30f66eeb Update : review and pep8 guide changes made +1503f364 remove old env.sh.template +17419719 update README.md with new docker-compose dev setup +fb2d7603 Remove non-essential parameter from get_response_json test +2d058d3a Code formatting +229b67a7 Add generic tests for checking the get_response_json method and remove them from provider scripts +c428a490 add env.template to give the format of the .env file +b7583dfc Rename test_phylo_pic.py to test_phylopic.py +d994bdcb Remove duplicate file phylo_pic.py, created for case insensitive systems +3d3fb4aa Remove deprecated PhyloPic.py +ad5d183c Revert "Remove old phylopic.py and test_phylopic.py" +8252aa03 Remove old phylopic.py and test_phylopic.py +294f22c6 Create Apache Airflow DAG to run new phylopic.py script. +1dd89444 Update main.yml +6c44f90f rewriten clevelend provider script with ImageStore. 
+2117b176 Improve rawpixel.py - rename, private non-main functions, reduce main() +bd265c0a initial fix; This needs to be checked more in depth +e4c8d930 Further stylistic changes to support future signature changes +fb4b1ca0 Add stylistic edits to support signature changes in future +677bc0f8 Merge remote-tracking branch 'upstream/master' +2837027d Merge branch 'master' into dag_specific_loader +63bf5f07 feat: write unhappy path tests for _get_image_data +676b6bff Merge branch 'master' into dag_specific_loader +6c220b95 fix: remove unused imports and fixed new function call +f8accb48 Split process_image_data method into smaller methods in rawpixel.py +410335f4 add drop table test, rename table creator +a6e6dbd0 add more sql function tests for loading and upserting logic +47447368 Update main.yml +0f5ae13b Rewrite RawPixel.py using new ImageStore class - rawpixel.py +6938cefd Create main.yml +f6aed3d7 add more sql function unit tests +0005fb81 Merge pull request #313 from creativecommons/rename_old_scripts +528e5810 Format code +13f5f9a2 Use the get_response_json function provided in requester class with phylopic script and related tests +845802ef rename phylopic.py to phylo_pic.py +af28c7c1 add initial sql function tests +883708df modify loading table creation query to fail if it already exists +ff5d1f47 add tests for util.paths submodule +e267c2cf change funciton name in paths.py to match new operator definition +d86e2b8e rename DAG tasks (nodes) for clarity +e5539765 reorganize loading operators and logic into a package +218c24ad Fix test which mocks the get_response_json method +8cad5316 increase testability of loader_workflow.py +8e0bad68 change to official python base Docker image +d1cc3db5 Use the get_response_json function provided in requester class with wikimedia commons script and related tests +98de9d1a Add get_response_json function to requester class to minimise repetition of code +fb41ae58 Merge pull request #276 from akshgpt7/phylopic +89034dfc Merge pull request #285 from ChariniNana/master +2b868f7e Merge pull request #309 from creativecommons/kgodey-patch-2 +09f3c924 Merge pull request #306 from creativecommons/issue_template_fixes +e91381ea Code formatting in the test file +63a9b846 Merge pull request #308 from creativecommons/kgodey-patch-1 +453165dc Code formatting to comply with PEP8 +304a0f84 Delete older CODEOWNERS file +62fa4034 Added CODEOWNERS file +217278f7 Add foreign_identifier arg to add_item in phylopic.py, write test for it and add default 'all' value to date +fdaaaa39 modify issue templates as per comments by Timid Robot and Kriti +3ea544fd Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +15009eb4 Modify date keys to reflect the information they provide +12c55130 Merge remote-tracking branch 'upstream/master' +6f99b434 Simplify parameters and minor fixes in phylopic.py +c61a76e9 Merge pull request #303 from mariuszskon/fix/etlmods-deprecation-warning +655be1dc increase wait for file to finish updating +9475bd68 Merge pull request #290 from qubit99/master +d4ed42b6 parallelize DB loading somewhat, +a9b250b1 Updated test to verify date uploaded/taken are stored in meta data +afea1e98 Merge remote-tracking branch 'upstream/master' +3820596f Fix etlMods.py DeprecationWarning for invalid escape sequence +e523687b fix:handle foreign_landing_url value from the API and rename _get_data_for_each_image to _get_data_for_image +a1b6d9e4 fix:made requested changes and chnged foreign url according to new API +350f87ea Merge pull 
request #292 from creativecommons/new_issue_templates +ba67f961 add numerous new templates for issues +6904102d Updated Docstring +21e4f6c8 Merge pull request #288 from Milind712000/fix-readme-file-links +f6502b0b Merge branch 'master' into dag_specific_loader +57566942 increase wait time for production, change name for descriptiveness +6fd9210a Write tests for phylopic provider API script +663f23f3 Add example files for phylopic tests +3d6a0442 Fix README.md file links +dcc44ef6 extract sql.py from main DAG file for clarity +fe1bf147 finish basic version of loader DAG, add a smoke test +79eb62d8 fix:write tests for _get_data_for_each_image through _process_image_data, used new api response as sample response and enforced PEP8 standards in test +3d2766a4 [Issue 222] date info added to metadata +412a2e74 fix:removed unused vars and imports, fixed fatal bug in get_data_for_each_image by using keyword arguments, removed default mode from arg parse and left only date as the sole parameter and refactored code to confirm with PEP8 +fb58d77d add branching logic to loader dag +7161426a Break down larger functions +7962b201 Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +a0bf4e3d Refactor phylopic.py according to make requested changes +6f38793f fix:changes test for met..museum.py to function with mockpatches +0350bd7d fix:requested changes in metropolitan_museum_of_art.py +34a2fc55 add new loader_workflow.py, refactored a bit +7ba0a4e5 add initial Smithsonian Institution Provider API script +fc8f003e Merge pull request #272 from akshgpt7/flickr +e5aa677a Fix break logic in flickr.py and write test for it +373f4a66 refactor:change class names to follow convention +70140556 refactor:logger format to the new syntax +fa27f6a5 feat: completed test for create_meta_data +2691d079 feat: completed test for get_response_json +0b5d1e73 feat: completed test for get_object_ids +0a6bcf67 feat: refactored code with new classes +1578db9a feat:changed file to new name and updated imports from the new classes +17d8b8b9 Add _get_response_json() method +a000160b Merge branch 'master' of https://github.com/creativecommons/cccatalog into phylopic +668872d9 Refactor the new phylopic API script +15183839 Rewrite phylopic API script using new model. 
+00d4969b Merge branch 'master' of https://github.com/creativecommons/cccatalog into flickr +c6acda37 Fix requested changes +7ac75321 Merge pull request #269 from creativecommons/provider-api-issue-template +0b1c0299 Logical fix in returning Nonetypes +08da7324 Change 'tries' to 'max_tries' in flickr's provider api script's _get_image_list method for more clarity +44e6c0b9 Refactor flickr API script's _get_image_list method to use 'tries' instead of 'retries' +e2be5c4d remove unnecessary comment in cleveland museum +c309dcf0 Merge pull request #270 from akshgpt7/cleveland +f864e891 use has_image parameter in Cleveland Museum script +807cb5e8 Add Provider API issue template +ebf8582a add docker-compose to test Apache Airflow operations on PostgreSQL +f00d1576 Merge pull request #266 from creativecommons/flickr_dag +e40669f3 remove deprecated Flickr script and associated cruft +cdf90758 add new flickr dag, and a basic smoke test +b4c9c70b Merge pull request #263 from creativecommons/flickr_rewrite +849aaad9 add whitespace around equals sign +1080186a add more tests, most at a higher level +7a3518de extract json checking to its own function +20f81d49 add more tests for new flickr.py script +87b8ae1c Merge pull request #261 from creativecommons/wikimedia_timeout_increase +8d226e11 raise exception if retries are exceeded +2a2c48ab increase timeout to help wikimedia commons script succeed +1c1fc52b Merge pull request #259 from creativecommons/requester_bugfix +283c4c8c delete extraneous requests.get outside of try/except block +752676cb add resource jsons to avoid large test functions +bb04b7b5 add more tests for Flickr script +c99b09e9 default to empty string for description so we can strip it +97f853dd rewrite Flickr.py to flickr.py, add basic tests +f373ed35 add logging message for missing columns in ImageStore +e3d1b407 Merge pull request #250 from creativecommons/wikimedia_commons_dag +56eda8b8 expand ts to timestamp for clarity in wikimedia_commons.py +a520362d change itereator variable from i to _ +b5a68a6e modify wikimedia_workflow DAG test to work from other directories +3ad1c3dd add DAG for new Wikimedia Commons script +fbfcb8d4 change image_batch recursion into loop for safety +a99c39f4 remove old WikimediaCommons script and tests +98a31aa6 update Dockerfile to avoid permissions problems with env.sh +c36f3d28 Merge pull request #248 from creativecommons/wikimedia_rewrite +e6b1334a conform to PEP8 numbers of lines between test functions +232588b1 add docstring for initializing DelayedRequester +8ea121b8 Add globalusage tallying logic to wikimedia_commons.py +1f3d1fa1 port and add tests for new wikimedia_commons.py script +066ccf4a Merge pull request #246 from creativecommons/image_repository_class +752e85c3 add initial version of new wikimedia_commons.py script +ab0cabf8 add DelayedRequest class to handle rate limited requests +b314e9d0 reorganize image store class +1f9e7e8f add early exit when there is no license or version available +6b7c3088 reorganize directory structure for delayed_request module +5049b57b add logic to enrich meta_data with license_url by default +de23bd56 change pairs from generator to list comprehension for logging +796f6c5b move unused filesize column one layer to DB +2ad78259 add docstrings to public ImageStore methods +640da118 add docstrings to public functions in storage.util +f2a5c010 remove unused enforce_all_arguments_truthy function +b5b204b8 try harder to cast booleans, add docstrings for column classes +4dd8b15c finish ImageStore.commit logic 
+852757f9 add writing to disk and more verification logic to ImageStore +74c29f33 add image.py with ImageStore class +4b7f8752 add columns.py, creating column types +afddfa0b add sanitization of strings and json to storage utilities +d56d79ae add methods to enforce truthiness and merge provider/source +c4b80600 Merge pull request #245 from creativecommons/turn_on_separate_dags +41d04dfb add character limit enforcer to avoid DB import failures, +781435ed add basic utilities to be used by ImageStore class, +0ae98f94 schedule separate DAGs to run in place of dailyWorkflow.py +993c74d2 Merge pull request #235 from creativecommons/json_string_bugfix +329b2c1c rewrap function arguments +24e95792 add function to sanitize json values before dumping to json string +42b65e91 Merge pull request #230 from creativecommons/airflow_daily_dag_split +bd19070d capitalize all letters in constants from `config.py` +ad2b3411 wrap long line +133851be add comment to Dockerfile explaining 'hash' style image tag +c2fb113d Change links to reference style, where appropriate +cdaa0a77 specify Amazon EMR to avoid acronym-knowledge overhead +80acb00f make minor changes to conform to PEP8 +c0a67a4a update README with new filepaths, and clean up cruft +0051db98 remove old dailyWorkflow.py DAG file +6c69a400 Add newlines to ends of files to please git +939b0922 Fix crontab bug in WikimediaCommons DAG; add DAG config validation +a307b323 split dailyWorkflow.py into separate DAGs. +9676210a use env.sh in Dockerfile +1ad3cfc7 add airflow testing detritus to .gitignore +bd5c05e3 move dag files to synchronize with Docker Container +08d37f63 move Dockerfile and requirements.txt to DAG directory +35697bb5 Move example output files to testing resource location +33c53961 move api provider scripts to make deployment simpler +cf7d68a8 Merge pull request #226 from creativecommons/wikimedia_commons_creator_fix +0e8d9818 Use single quote for all strings in `test_WikimediaCommons.py` +0c28c78f Extract json examples from test file +37959914 use parentheses instead of backslashes for line continuation +cd765de2 extract row_generator from list comprehension for readability +b1a6eb06 change docstring to correct endpoint for documentation +b07de227 add functionality to scrape text from description field +db7fc30a refactor WikimediaCommons.py with snake_case, change endpoint +cee194dc Merge branch 'flickr_test_speed' into wikimedia_commons_creator_fix +4027b911 bring WikimediaCommons.py into pep8 compliance, remove star import +d3f8aa50 monkeypatch delay function to speed up testing +897a808a refactor create_tsv_list_row, add logging functionality +9dfa77e5 Refactor getMetaData into process_image_data +906d64b5 change Wikimedia Commons script to use artist info for creator +cf8c0d85 (tag: v0.1.0) Merge pull request #218 from creativecommons/extract_row_formatter +1392dadf change name of etlMods import to be more meaningful +574a3684 align more variables with pep8, remove star import from Flickr.py +08c608e1 extract row-writing function from Flickr.py +b76dc709 Merge pull request #213 from creativecommons/test_env_setup +8ed02300 add Dockerfile to set up local testing/development environment; add test for Flickr.py +fd9d71e0 Merge pull request #198 from creativecommons/string_sanitizer_bugfix +bd75338c add casting to string as first step of sanitizeString method +ece000fe Merge pull request #190 from paulofilip3/master +3a66c105 Fix requirements.txt +9cbf0f76 Add gitignore +2c0fb06b Update README.md +4dd2b8ba Update README.md +ce9c808e Update 
README.md +3d787d11 Update README.md +d58c7a4a Update CC Catalog Common Crawl test cases +662fd157 Update README.md +74cc30fc Update requirements.txt +55445e89 Update README.md +aef747e3 Add wikimedia commons to the workflow +e01dd5cf Bug fix +e22fef7d Add cc catalog workflow scripts +ab5bf45c Add Wikimedia Commons using the API +d7faf342 Add NYPL using the API +aef20219 extract popularity metrics from Behance +8fd4d09e Add new date parameter to query images on flickr +401c4737 Remove Brooklyn Museum from the common crawl providers +d0607b61 Add RawPixel using the API +4c6e6b36 Remove RawPixel from the common crawl providers +5d9f9a2d Merge branch 'master' of github.com:creativecommons/cccatalog +e6c5e75d Add new provider +b5695fa3 Add function to extract the license +d39b37bf Update CONTRIBUTING.md +94de9068 Update .cc-metadata.yml +0dd15188 Update and rename .github/CODEOWNERS to CODEOWNERS +0c55f62e Update and rename CODEOWNERS to .github/CODEOWNERS +83fc4b20 Update common crawl and api jobs to standardize the output data +146674d2 Sanitize strings +b55def75 Sanitize strings +612cc846 Bug fix +312a9d5f Bug fix +9c838103 Add new provider +1483e51a Bug fix +384ebcfe Update modules +8740f0fb Update the output description +c9bdb3da Add optional parameter for http requests +25173032 Create function to extract CC license and version from url +65994fe0 Make repository contribution ready +e4477830 Bug fix +9cf6a058 Update log statistics +dc15c511 Bug fix +b884a402 Set default mode to the start of the previous hour +dd9e0488 Include sys module +be04fd8c Escape special characters +65769114 Add new provider +98442106 Bug fix +d2709afe Add new Common Crawl providers +5d854355 Replace empty strings with null +0eac921d Update thumbnail source +e95e9191 Update code to parse changes in the HTML +6f32e45b Bug FIx +e984d251 Bug fix +72b1b714 Add new providers +2fbbeb43 Bug fix +0fb57d2a Bug fix +34a222fe Refactor common crawl scripts +44ed6124 Update argument name +a752543f Update driver to identify 3D models by date +dd0ec1e7 Remove redundant modules +fe6300d5 Update imported modules +e92268da Bug fix +aa01a978 Modularize code +e4d57373 Modularize code +f2c0b51b reformat the output and santitize strings +9fb964d6 Provide optional arguments to execute the script +2368ef60 Add Met Museum as a new content provider +5c4b476b Update image detection logic +9d98d82a Change common crawl index extraction logic +f2a04b14 Merge branch 'master' of github.com:creativecommons/cccatalog +ace5fdc2 Add Cleveland Museum as a new provider +b2ca5b0b Update requirements.txt +1ad2b811 Add new API provider +0c37ad38 restructure common crawl providers +473490f7 Merge branch 'master' of github.com:creativecommons/cccatalog +16196822 Add generic exception handling for requests +62103934 Update README.md +d1efa480 Update README.md +d5901e0a Update ExtractCCLinks.py +c6949f15 Add new providers +d44bdc45 Add new providers +07a8b3d6 Create a default parameter for the common crawl ETL process +a31aa0f1 Add new content providers and sample data +76af3781 Add data extraction steps for the Met +00417189 Merge branch 'master' of github.com:creativecommons/cccatalog +adb2d20f Add the Met Museum as a content provider +125504d6 Update README.md +9b84a269 Add test cases +f5532c42 Update ExtractCCLinks.py +1a813349 Identify domains that link to creative commons +a31ee4bb First tests for the common crawl parsing strategy +2138ad24 Some basic tests for working with spark and wark files and some example processing we are going to be running. 
+ffa2b85e Initial commit From 17985e8e4ce9ed3ae212ed714aa7b6a9f69ae6ae Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Wed, 26 Oct 2022 13:56:02 +1100 Subject: [PATCH 8/9] Short circuit empty fields into `None` record --- .../provider_api_scripts/europeana.py | 115 +++++++++++++----- .../provider_api_scripts/test_europeana.py | 59 +++++++-- 2 files changed, 135 insertions(+), 39 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py index 8dbb19624..3e3de3504 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/europeana.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/europeana.py @@ -9,6 +9,7 @@ Notes: https://www.europeana.eu/api/v2/search.json """ import argparse +import functools import logging from datetime import datetime, timedelta, timezone @@ -23,6 +24,30 @@ logging.getLogger(common.urls.__name__).setLevel(logging.WARNING) +class EmptyRequiredFieldException(Exception): + def __init__(self, method_name: str, value): + super().__init__(f"`{method_name}` returned an empty value: {value}.") + + +def raise_if_empty(fn): + """ + Used to decorate RecordBuilder methods for "required" fields + to shortcut record building in the case where a record would + be missing some required fields and be thrown out anyway. + """ + + @functools.wraps(fn) + def inner(*args, **kwargs): + value = fn(*args, **kwargs) + + if not value: + raise EmptyRequiredFieldException(fn.__name__, value) + + return value + + return inner + + class EuropeanaRecordBuilder: """ A small class to contain the record building functionality @@ -30,50 +55,76 @@ class EuropeanaRecordBuilder: """ def get_record_data(self, data: dict) -> dict: - record = { - "foreign_landing_url": self._get_foreign_landing_url(data), - "image_url": data.get("edmIsShownBy")[0], - "foreign_identifier": data.get("id"), - "meta_data": self._get_meta_data_dict(data), - "title": data.get("title")[0], - "license_info": get_license_info( - license_url=self._get_license_url(data.get("rights")) - ), - } + try: + record = { + "foreign_landing_url": self._get_foreign_landing_url(data), + "image_url": self._get_image_url(data), + "foreign_identifier": self._get_foreign_identifier(data), + "meta_data": self._get_meta_data_dict(data), + "title": self._get_title(data), + "license_info": get_license_info( + license_url=self._get_license_url(data) + ), + } + + data_providers = set(record["meta_data"]["dataProvider"]) + eligible_sub_providers = { + s + for s in EuropeanaDataIngester.sub_providers + if EuropeanaDataIngester.sub_providers[s] in data_providers + } + if len(eligible_sub_providers) > 1: + raise Exception( + f"More than one sub-provider identified for the " + f"image with foreign ID {record['foreign_identifier']}" + ) + + return record | { + "source": ( + eligible_sub_providers.pop() + if len(eligible_sub_providers) == 1 + else EuropeanaDataIngester.providers["image"] + ) + } + except EmptyRequiredFieldException as exc: + logger.warning("A required field was empty", exc_info=exc) + return None - data_providers = set(record["meta_data"]["dataProvider"]) - eligible_sub_providers = { - s - for s in EuropeanaDataIngester.sub_providers - if EuropeanaDataIngester.sub_providers[s] in data_providers - } - if len(eligible_sub_providers) > 1: - raise Exception( - f"More than one sub-provider identified for the " - f"image with foreign ID {record['foreign_identifier']}" - ) - -
return record | { - "source": ( - eligible_sub_providers.pop() - if len(eligible_sub_providers) == 1 - else EuropeanaDataIngester.providers["image"] - ) - } + @raise_if_empty + def _get_image_url(self, data: dict) -> str | None: + group = data.get("edmIsShownBy") + return group[0] if group else None + + @raise_if_empty + def _get_foreign_identifier(self, data: dict) -> str | None: + return data.get("id") + + @raise_if_empty + def _get_title(self, data: dict) -> str | None: + group = data.get("title") + return group[0] if group else None + + @raise_if_empty + def _get_license_url(self, data: dict) -> str | None: + license_field = data.get("rights") + if not license_field: + return None - def _get_license_url(self, license_field) -> str | None: if len(license_field) > 1: logger.warning("More than one license field found") for license_ in license_field: if "creativecommons" in license_: return license_ + return None + @raise_if_empty def _get_foreign_landing_url(self, data: dict) -> str: original_url = data.get("edmIsShownAt") - if original_url is not None: + if original_url: return original_url[0] europeana_url = data.get("guid") + return europeana_url def _get_meta_data_dict(self, data: dict) -> dict: @@ -163,7 +214,7 @@ def get_should_continue(self, response_json: dict): def get_batch_data(self, response_json: dict) -> None | list[dict]: if response_json.get("success") != "True": - logger.warning('Request failed with ``success = "False"``') + logger.warning('Request failed with ``success != "True"``') # No batch data to process if the request failed. return None diff --git a/tests/dags/providers/provider_api_scripts/test_europeana.py b/tests/dags/providers/provider_api_scripts/test_europeana.py index 91e51e485..d1fea135e 100644 --- a/tests/dags/providers/provider_api_scripts/test_europeana.py +++ b/tests/dags/providers/provider_api_scripts/test_europeana.py @@ -86,17 +86,21 @@ def test_get_should_continue_updates_cursor(ingester): @pytest.mark.parametrize( ("response_json"), ( - {"success": "True", "nextCursor": None}, - {"success": "True"}, - {"success": "False", "nextCursor": "blam"}, + {}, + {"nextCursor": None}, ), ) def test_get_should_continue_returns_false(ingester, response_json): assert ingester.get_should_continue(response_json) is False +def test_get_batch_data_returns_None_if_success_not_True(ingester): + response_json = {"success": "False", "items": [1]} + assert ingester.get_batch_data(response_json) is None + + def test_get_batch_data_gets_items_property(ingester): - response_json = {"items": object()} + response_json = {"success": "True", "items": object()} assert ingester.get_batch_data(response_json) is response_json["items"] @@ -156,9 +160,7 @@ def test_get_license_url_with_non_cc_license(record_builder): image_data = _get_resource_json("image_data_example.json") image_data["rights"] = ["http://noncc.org/"] - assert record_builder.get_record_data(image_data)["license_info"] == LicenseInfo( - None, None, None, None - ) + assert record_builder.get_record_data(image_data) is None def test_get_license_url_with_multiple_license(record_builder): @@ -304,3 +306,46 @@ def test_process_image_data_with_sub_provider(record_builder): "meta_data": expect_meta_data, "source": "wellcome_collection", } + + +DELETE = object() + + +@pytest.mark.parametrize( + ("field_name", "value", "extra_empty_fields"), + ( + ("id", "", ()), + ("id", None, ()), + ("id", DELETE, ()), + ("edmIsShownAt", "", ("guid",)), + ("edmIsShownAt", [], ("guid",)), + ("edmIsShownAt", [""], ("guid",)), + 
("edmIsShownAt", None, ("guid",)), + ("edmIsShownAt", DELETE, ("guid",)), + ("rights", [], ()), + ("rights", [""], ()), + ("rights", ["not-cc"], ()), + ("rights", DELETE, ()), + ("title", "", ()), + ("title", None, ()), + ("title", DELETE, ()), + ("edmIsShownBy", "", ()), + ("edmIsShownBy", None, ()), + ("edmIsShownBy", [], ()), + ("edmIsShownBy", [""], ()), + ("edmIsShownBy", DELETE, ()), + ), +) +def test_record_builder_returns_None_if_missing_required_field( + record_builder, field_name, value, extra_empty_fields +): + image_data = _get_resource_json("image_data_example.json") + for empty_field in extra_empty_fields: + del image_data[empty_field] + + if value is DELETE: + del image_data[field_name] + else: + image_data[field_name] = value + + assert record_builder.get_record_data(image_data) is None From f02a4741814ede69b5349b0988241a88263364f1 Mon Sep 17 00:00:00 2001 From: sarayourfriend <24264157+sarayourfriend@users.noreply.github.com> Date: Fri, 28 Oct 2022 10:19:02 +1100 Subject: [PATCH 9/9] Remove erroneously added file --- qq | 1399 ------------------------------------------------------------ 1 file changed, 1399 deletions(-) delete mode 100644 qq diff --git a/qq b/qq deleted file mode 100644 index 12694a072..000000000 --- a/qq +++ /dev/null @@ -1,1399 +0,0 @@ -1ea1a507 (HEAD -> refactor/europeana-provider-base-class) Add back default cursor from previous implementation -b54f8af1 (origin/refactor/europeana-provider-base-class) Update provider workflow config for Europeana -384380dc Remove unnecessary batch_limit override -dfac5064 Fix description missing if en or def are empty -2781620c Refactor Europeana to use ProviderDataIngester base class -cc4b2f9f Add default implementation for `get_media_type` for providers with single media type -9ff501e6 (origin/main, origin/HEAD, main) 🔄 Synced file(s) with WordPress/openverse (#802) -63b0fb7f Retire TSV loading workflow (#789) -3217ed5e Made improvements to `CONTRIBUTING.md` (#791) -8f92318c (tag: v1.3.5) Refactor Freesound to use ProviderDataIngester (#746) -46c2c161 Retire Walters Art Museum provider script (#786) -740cf00c Bump pytest-mock from 3.9.0 to 3.10.0 (#781) -db47359a Refactor Jamendo to use the ProviderDataIngester (#741) -d678dc7a Disable email on failure by default (#788) -8ee7fb72 Add concurrency settings for workflow (#770) -fcf1d90c 🔄 Synced file(s) with WordPress/openverse (#787) -4fee8ce9 Increase dependabot PR limit to 10 (#780) -337ea7ae 🔄 Synced file(s) with WordPress/openverse (#771) -4cb9d417 Fix italics for duration disclosure (#769) -f6538ce2 Bump pre-commit from 2.14.0 to 2.20.0 (#779) -d378ba7b Bump tldextract from 3.3.1 to 3.4.0 (#777) -d926e083 Bump apache-airflow[amazon,http,postgres] from 2.4.0 to 2.4.1 (#767) -06faf94f Bump pytest-sugar from 0.9.4 to 0.9.5 (#751) -62ee12a0 Bump isort from 5.9.3 to 5.10.1 (#764) -e49b0c32 Bump black from 22.3.0 to 22.10.0 (#778) -c1b970b1 Add user agent to StockSnap header and use header in requests by default (#765) -d4dbf4d0 Improved data refresh status reporting (#744) -7bf37fc5 Bump pytest-mock from 3.6.1 to 3.9.0 (#749) -58247f67 Bump tldextract from 3.1.0 to 3.3.1 (#752) -600b9eea Remove periods after URLs in log lines. 
(#763) -b571d024 Bump flake8 from 3.9.2 to 5.0.4 (#750) -516d7674 Add dependabot config (#740) -323d07bc Refactor SMK script to use the `ProviderDataIngester` class (#742) -3b58e60c Default unfurling of links and media to False in Slack notifications (#743) -b9f29df9 (tag: v1.3.4) Add tags option for provider workflows & "legacy-ingestion" tag (#739) -b4ef93ce Bump Airflow to 2.4.0, standardize version bump process (#737) -cec68932 Use Airflow variable to omit DAGs from any Slack notification (#644) -4a9c008a Update reingestion workflows to load and report data (#618) -fc627743 🔄 Synced file(s) with WordPress/openverse (#735) -28bfd169 Add spellcheck to pre-commit config (#718) -a329be22 (tag: v1.3.3) Bump Airflow version to 2.3.4 (#731) -e66bf63f 🔄 Synced file(s) with WordPress/openverse (#733) -5e7119c5 (tag: v1.3.2) 🔄 Synced file(s) with WordPress/openverse (#728) -6e9d02d6 Add none check for Cleveland `image_data` (#709) -bcda6e0b Add `DEPLOYMENT.md` & deployment-related files (#711) -70312d08 Remove error swallowing during ingestion (#713) -18decf9b Refactor Wikimedia Commons to use ProviderDataIngester (#614) -25feeb73 Allow string as exceptions in `on_failure_callback` (#695) -e9fe5b96 Always use Jamendo's "streaming" audio (#706) -9be8bcec Refactor Brooklyn Museum to use ProviderDataIngester (#701) -d828d257 Fix dagrun conf for provider scripts (#708) -70c66f93 Initialize iNaturalist with dagrun conf (#707) -b85df5b2 hardcodes the test ingestion limit to 1 000 000 (#705) -8ac257cd Refactor Metropolitan Museum of Art to use ProviderDataIngester (#674) -2a9647ab Always record provider run duration (#694) -03ce84f2 Allow DAGs to silence only errors matching predicate (#654) -a9417694 Bump iNaturalist timeouts to 5 days (#691) -8a28f948 Update CODEOWNERS (#677) -1c2fbe8a Standardize on datetime over pendulum (#678) -ee474f2c Add iNaturalist.org metadata (#549) -2d41485f Add Openverse email to DAG default args (#683) -2273271f Update audioset_view to use most recently updated f_id/provider pair (#660) -1d407e81 Use Python 3.10 everywhere (#656) -83c688d8 (tag: v1.3.1) Add configuration options to skip ingestion errors (#650) -f6e8fa01 Upgrade Airflow to v2.3.3 (#664) -40b4306b Updates Handbook Link (#662) -54aee38b Re-ping if PR is updated and don't ping if 2 approvals exist (#642) -0072114e Tighten exception handling, always flush buffer (#645) -0f584754 Automatic DAG documentation generation (#649) -704b33cb Only delete dag runs/task instances during testing that match pattern (#651) -9fea6554 Omit DAGs that are known to fail from alerts (#643) -d33083de Fix typo in README (#652) -492ae8b5 Data refresh record difference reporting (#636) -a2e1d50f Use the default provider categories during ingestion (#635) -8c04155c Partition TSVs by date (#632) -06046dae Only drop load table if it exists (#634) -991162bd Refactor Science museum to use ProviderDataIngester (#576) -ea5a7f06 Refactor Museum Victoria to use ProviderDataIngester (#600) -78232410 Re-raise pytest-socket errors within DelayedRequester (#629) -1882d777 Update Finnish Museums to use base class (#579) -2f9df1bf Adjust load data timeout and retries (#626) -3ee97b6c Update data refresh DAG to account for manual go-live (#578) -8754cb63 Generate TSV filenames in separate step (#620) -4451ee03 Patch Stocksnap tests that called out to external API (#628) -0bd0b002 Turn on catchup for dated DAGs to allow backfill (#602) -c294e3e2 Ignore DS_Store files (#627) -cd7ca961 Add date range to ingestion load reports (#613) -7bf8ec42 Update 
Openverse URL in the user agent string (#612) -5e86d291 Unify header added (#610) -ea1016de Add test to check for import errors for all DAGs in the dags dir (#580) -cc7322b8 Refactor StockSnap to use ProviderDataIngester (#601) -a5101162 🔄 Synced file(s) with WordPress/openverse (#604) -1f97c69a 🔄 Synced file(s) with WordPress/openverse (#603) -42ddf2bf Add missing `MD5` hash to foreign id comparison (#575) -1c9fd4cf Add base class for Provider API scripts (#555) -aacab7bb Add `filetype` to Phylopic script (#547) -dc8a68ba Post comments using JSON instead of form data (#570) -ae122cc7 Add `filetype` test to Metropolitan script (#568) -49ed0dcf Add audio_set_foreign_identifier to the audio materialized view (#565) -fa5e97b5 Fix module import for PR review reminder DAG (#566) -f5de3e2a Add PR review reminder DAG (#553) -825c5aad Add `filetype` and `filesize` to Cleveland Museum of Art API script (#537) -2ae1c4d4 Add `filetype` and `filesize` to SMK script (#542) -c4ffd95d Add flag to strip slash in urls while validating (#556) -fec0d9d1 Consolidate provider workflows using dynamic DAGs and dataclasses (#540) -4d6e5392 Add a helper function to extract extension from the media URL (#545) -5f5fc7cc Create DAG objects at top level (#551) -90980825 Add DAG to report reported media pending review (#513) -8ca6a230 Correct order of None handling in Cleveland provider script (#544) -b36aac72 Remove thumbnails from images (#526) -4cba0ced Unconditionally destroy buckets after testing (#516) -40a0d8a2 Simplify WP Photo Directory script and get missing authors (#515) -bba0413a (tag: v1.3.0) Ensure SMK images don't timeout on validation (#506) -7dbff464 airflow dockerfile: set `PYTHONPATH` to DAGs folder (#514) -31f461ac Generate DAGs to recreate popularity calculations using a factory (#507) -9dca8963 Upgrade Airflow to 2.3, python to 3.10 (#502) -1ef4a133 Retry flaky request when Smithsonian provider script detects no unit codes (#508) -1fda166d 🔄 Synced file(s) with WordPress/openverse (#509) -3ef7138a Merge popularity calculations and data refresh into a single DAG (#496) -20458817 Don't delete custom pools during test cleanup (#501) -971f3966 🔄 Synced local '.github/CODEOWNERS' with remote '.github/CODEOWNERS' (#505) -0bd79797 Add human readable description for durations under 1 second (#500) -25c25d5b (tag: v1.2.2) Recreate the audioset matview after full popularity recalculation (#493) -99bb7389 Enable reporting when there is no data to load (#492) -e98cf799 Make Airflow connection variables easier to read (#480) -b44f3398 Wikimedia: Catch bit rates that are greater than the int max (#475) -80930e9d Fix `alt_files` duplicates (#479) -4738e3f7 (tag: v1.2.1) Update Smithsonian Unit code checker DAG to alert to Slack (#452) -81952618 Change docker-compose restart policy for local development (#474) -c928589b Improved load reporting (#471) -9b962860 Rename Thingiverse.py to thingiverse.py (#472) -8c87749f Show duplicate record count in completion slack message (#442) -dd57690a Re-introduce pytest-socket (#467) -3755baeb Adjust timeouts for Data Refresh `wait_for_completion` step (#458) -039371e7 Use safe_search param to restrict results from Flickr (#460) -24117f01 Upgrade black to 22.3.0 (#463) -6e10adb6 🔄 Synced file(s) with WordPress/openverse (#462) -ed41f4e7 🔄 Synced file(s) with WordPress/openverse (#459) -c164382f Remove `apt upgrade` from PG image, upgrade to 13.6 (#455) -1d9c96d6 Handle case where Wikimedia has no audio metadata (#443) -ecaf732a 🔄 Synced file(s) with WordPress/openverse 
(#444) -962f1c5d Send single slack notification per provider on TSV load complete (#434) -5bce20de 🔄 Synced file(s) with WordPress/openverse (#441) -67609549 🔄 Synced file(s) with WordPress/openverse (#440) -dd347a43 (tag: v1.2.0) Add data refresh to Airflow (#397) -7cccf888 Change PhyloPic date range & schedule interval (#423) -e41e8d7a Add LRU cache to `is_valid_license_info` (#424) -c4d381fd Round duration for provider ingestion completion message (#422) -0e3675ca Enable XCom pickling in Airflow (#421) -22a8965b Use published Docker image in primary docker-compose.yml (#417) -5f941e6e Fix invalid license urls from Finnish Museum API (#418) -124d23b6 Reduce noise in NYPL ingestion (#415) -e8400cd4 Add ConnectionError to acceptable flaky exceptions for Freesound (#413) -fd68b9ea Fix schedule intervals on Cleveland Museum & Wikimedia Commons (#416) -3034e31f Update API requests for Museum Victoria DAG (#414) -8712f325 Add OFEO-SG subprovider (#412) -c5cad660 Handle duplicate keys in load_data task (#395) -38ee4938 Make 'sound' category more specific (#402) -23638152 Group test runs by module or class (#409) -e5f820ad 🔄 Synced file(s) with WordPress/openverse (#404) -7f19de2a 🔄 Synced file(s) with WordPress/openverse (#403) -f3808d85 Update Slack messages to include environment (#382) -69b2eb7a (tag: v1.1.0) Update Airflow to 2.2.4 (#372) -2b2f9636 Reconfigure retries & timeouts for typical ingestion DAGs (#361) -5188b388 Add slack message on TSV load complete (#369) -9538f384 Add provider media type to DAG tags (#360) -ada025d8 Trigger TSV loading immediately after workflow (#357) -cb19f839 Use Airflow Variables for storing API keys (#362) -9555374b Differentiate between slack channels (#359) -6dd5cb34 Updated user agent for Wikimedia Commons #140 (#355) -8431b4be Remove buckets after testing (#344) -760eab51 Use pytest-xdist for testing (#337) -cbb26f14 Ensure Freesound tests are isolated (#340) -a8df91ab Change minio ports from 500X to 501X (#341) -cadc9d3e (tag: v1.0.0) Freesound SSLError fix (#330) -8de8d703 Set up CI/CD with ghcr.io (#332) -b07282dc Fix inconsistent alignment in slack message text (#328) -5a954f91 Properly handle "None" values returned from Freesound API (#327) -0c5b43fb Add audioset_view to catalog DDL (#320) -c50f487c Set default timeout to 12 hours (#311) -738c9d88 Change request info log to debug to prevent spam (#312) -b1f24143 Make commoncrawl bucket configurable, change default (#318) -52c1d245 🔄 Synced file(s) with WordPress/openverse (#317) -8fc29a22 🔄 Synced file(s) with WordPress/openverse (#314) -4e965e2c Extend Jamendo's timeout to 24 hours (#310) -c4f3f965 Disable TSV loader scheduling (#309) -836bbbf4 Upgrade to Airflow 2.2.3 (#308) -ab90e8ee Add unique indices to catalog (#306) -bb3c9436 Add Image Categories (#302) -9d2e6a5f Bump lxml from 4.6.3 to 4.6.5 (#303) -12f80005 Remove `get_*_operator` functions, simplify commoncrawl logic (#301) -6f5f598b Remove unnecessary logging.basicConfig calls (#299) -e40b86ce Slack alerting for DAG failures (#297) -75469b7d Refactor delay tests to prevent them from being flaky (#298) -2f14bbad Specific error message for auth errors on request, improve tests (#295) -16fd77fa Retire common_api_workflows, clean up config (#296) -fb05e35b 🔄 Synced file(s) with WordPress/openverse (#294) -baadc4f7 🔄 Synced file(s) with WordPress/openverse (#293) -89767ec9 Add Provider API script for Freesound (#95) -7b142c78 Reduce TSV loader complexity (#289) -14c2d6d7 Slack alerting utilities (#279) -e3cc70a2 Add DAG tags, remove health 
check workflow (#277) -adafb42d Add production deployment documentation (#271) -669067d2 Retire legacy ingestion column fix (#287) -8f6a1cd5 Retire cleaner_worfklow, pg_cleaner (#288) -6173fb44 Remove tsv_to_postgres_loader_overwrite (#286) -ef614b97 Add index creation for matviews (#280) -88322d2d Respository restructure (#276) -6025630d 🔄 Synced file(s) with WordPress/openverse (#274) -dc1df6b0 Retire update workflows, refactor operators (#266) -7ee62451 Add docker entrypoint to ensure db migration on startup (#270) -a5c2ee98 Replace moto server with Minio (#254) -3464826d OAuth2 DAGs and Machinery (#246) -74ad9bd8 Add pip upgrade command, docker optimizations (#265) -d65d4c46 Add `justfile` deployment recipe (#267) -454e9a2c 🔄 Synced file(s) with WordPress/openverse (#269) -b911c69b 🔄 Synced file(s) with WordPress/openverse (#268) -601f7639 Add args option to db-shell recipe (#259) -485fc34e 🔄 Synced file(s) with WordPress/openverse (#258) -e7d7e173 🔄 Synced file(s) with WordPress/openverse (#256) -9367dc18 🔄 Synced file(s) with WordPress/openverse (#255) -4c66afb1 Edit wikimedia_audio name in popularity sql (#253) -93255a19 Add pgcli to postgres container, db-shell recipe (#252) -d5d39f1a Improve `.env` documentation & structure, update values (#251) -91579b9f Remove prefixes from issue template titles (#250) -8ea757e0 🔄 Synced file(s) with WordPress/openverse (#249) -6d02802b Make Category a StringColumn (not an ArrayColumn) (#243) -97ef1da6 Fix type in contributing.md (#245) -ea31d810 Add sample WordPress REST API script (#223) -c6f94b09 Update provider template, refactor DAG parsing tests (#237) -f4b2abc1 Remove `trackid` query parameter from set thumbnail url (#239) -2e003466 hotfix whitespace in new issue template -a9666559 Merge pull request #238 from WordPress/rm-get-log-operator -1992ba17 Remove unnecessary dag from operator util test -926e6d4a Merge pull request #230 from lyu4321/issue-176 -da7c7f0f Update .github/ISSUE_TEMPLATE/image_provider_api_integration_request.yml -4b70e986 Merge pull request #240 from WordPress/update_test -c8524980 Use `with dag` in `test_operator_util` -7608dbb4 Update labels and desc for provider template -b1742cab Update labels and desc for source template -7f320c23 remove dag argument from all the operator creation functions -463913b8 init -c87aab0c Update desc in source template -c2a4afb3 Remove extra # -7f0a3673 Add desc and missing fields to provider template -a4123c69 Update source issue template from md to yml -a023b9d0 Update provider issue template from md to yml -b044229d Docker optimization & repository restructuring (#226) -aaca3b9e [Audio] Add Wikimedia as an Audio source (#197) -7ef1b9b4 Add new columns to MediaStore and database (#196) -0ade78b4 Merge pull request #221 from WordPress/stocksnap-popularity -032ff837 Use `just` commands in CI workflow (#218) -edd0ab9c Move dev-specific services into compose overrides file (#217) -fb24e39d Implement stocksnap popularity and popularity documentation -96011a0f Revert accidentally-pushed previous stocksnap test commit -bbd8898b Fix stocksnap test to use new metadata values -3c99a17a Merge pull request #206 from WordPress/repo-sync/openverse/default -0f0797d4 🔄 Synced local '.github/workflows/new_prs.yml' with remote '.github/workflows/new_prs.yml' -7b7a3122 Organize & document `justfile`, fix issue with recreate command (#198) -7af65647 Move storage module up and deduplicate MediaStore tests (#192) -793d67ab Merge pull request #194 from WordPress/airflow-credentials -c2dad879 Issue templates 
(#195) -032e6ce9 Update README.md -008185f2 Merge pull request #190 from WordPress/repo-sync/openverse/default -7fa22e8e 🔄 Synced local '.github/PULL_REQUEST_TEMPLATE.md' with remote '.github/PULL_REQUEST_TEMPLATE.md' -dd8bf859 Merge pull request #187 from WordPress/cleaned-up-docs -a48c70fc Update README.md -a445771e Update README.md -07d3ffb0 Update README.md -71785485 Add missing newline -c5a1be76 Streamline monthly + daily dag lists in README.md -8ca3ba0e Merge pull request #185 from WordPress/repo-sync/openverse/default -a0f304f0 🔄 Synced local '.github/workflows/pr_label_check.yml' with remote '.github/workflows/pr_label_check.yml' -a8061309 Merge pull request #184 from MuhammadFaizanHaidar/patch-1 -fade0eb7 Renamed the source suggestion issue template -ec7e08ad Merge pull request #179 from WordPress/add/recreate-recipe -48c1f0cb Merge pull request #180 from WordPress/repo-sync/openverse/default -18c6a10b 🔄 Synced local '.github/workflows/new_prs.yml' with remote '.github/workflows/new_prs.yml' -909ead95 Allow passing flags to `test` recipe -213904c8 Add recreate recipe -186e4aa1 Merge pull request #174 from WordPress/repo-sync/openverse/default -a39d32a1 🔄 Synced local '.github/workflows/pr_label_check.yml' with remote '.github/workflows/pr_label_check.yml' -ceedd6fe Merge pull request #173 from WordPress/repo-sync/openverse/default -8680573a Merge pull request #172 from WordPress/ack-update -42e64d88 Update README.md -e96f7e3f 🔄 Created local '.github/workflows/pr_label_check.yml' from remote '.github/workflows/pr_label_check.yml' -aeee7989 🔄 Synced local '.github/workflows/new_prs.yml' with remote '.github/workflows/new_prs.yml' -375bd82d 🔄 Synced local '.github/workflows/new_issues.yml' with remote '.github/workflows/new_issues.yml' -053d85ed Update acknowledgements section -8ca48c86 Use dag_factory for Provider API DAG creation (#163) -c5e6d4bf Merge pull request #159 from WordPress/add/formatters -9f8b50f9 Fix requirements.txt comment location -4af80b0a Remove unused flake8 annotations and fix exclude pattern -ac09b2f7 Make env.template not executable -117f84d3 Rename common lint job -a5116c82 Use pre-commit for CI linting -277de7a8 Add black and isort and apply to all files -17628e74 Merge pull request #153 from WordPress/add/just-scripts -c2d63246 Remove old recipe from readme -90461c21 Do not load any .env files for just -fa728f40 Ensure containers are running before running exec -d30dd25b Use more general language for logs recipe behvaior -f7e7847e Complete list of running containers -cfcc19dc Fix justfile to use dev configuration -9cecc481 Remove directions to switch directories from README -bb47e440 Rename makeenv to dotenv -f657e964 Add preliminary just scripts -1ed278ec Merge pull request #157 from WordPress/add/pre-commit -608fa9e2 Merge pull request #151 from WordPress/add/simulated-dag -7898fc5c Merge pull request #156 from WordPress/local_s3_bucket -cc4f2569 Organize requirements files and de-duplicate -54c4afc1 Apply pre-commit to all files -92ad9f0c Add general pre-commit hook -58861903 Add pre-commit -fa3c191c Merge pull request #154 from WordPress/update/switch-to-volumes -dad98228 Add note about volume prune -c35effa7 Update example value for `AIRFLOW_CONN_AWS_DEFAULT` envvar -9e25193a Add openverse-airflow-logs to BUCKET_LIST -25d16127 Use `tries` param instead of `TRIES` constant -8a0e75e5 Update README to remove volumes on cleanup -9e93f527 Switch local postgres to use volumes -7c9158ca Replace os.path with pathlib in provider API script template (#149) 
-28df4bf3 Update Apache Airflow version (#148) -a2d16cef Add manually run healthcheck DAG -04eaeeed Merge pull request #147 from WordPress/fix/provider-template-path -4f9e7341 Log cleanup DAG (#139) -c7310513 Fix resource path string -6c172033 Simplify catalog folder structure (#133) -ea3b2b8f Merge pull request #145 from WordPress/fix/make-harmonious-with-api -697b406a Allow running the catalog and the API at the same time -b12ba815 Merge pull request #114 from WordPress/stocksnap -5bf81223 Update StockSnap tests and example files -bfc9d0d8 Get creator data from StockSnap API -925272e2 Format with black & flake8 -a7d562a3 Make image's `title` from tags/keywords -8dca7f98 Get `foreign_landing_url` from StockSnap API -2d099915 Merge branch 'main' into stocksnap -3410f7e3 Merge pull request #136 from WordPress/airflow-remote-logging-example -7fc4fdbd Merge branch 'airflow-remote-logging-example' of github.com:WordPress/openverse-catalog into airflow-remote-logging-example -0ad6cfee Improve remote logging docs -a3269dd8 Merge branch 'main' into airflow-remote-logging-example -c739c24d Replace `genre` property with `genres` in tests (#137) -3818b201 Update to new values in Airflow 2 (logging namespace moved in airflow config) -5f0b2ca0 Add example vars for airflow remote logging -06fb4991 [API integration] Add Jamendo provider API script (#113) -f54ad987 Merge pull request #135 from WordPress/mv_docs -73e35915 Merge pull request #134 from WordPress/repo-sync/openverse/default -ca8dd1d1 🔄 Synced local '.github/ISSUE_TEMPLATE/' with remote '.github/ISSUE_TEMPLATE/' -4ec2d707 🔄 Created local '.github/workflows/new_prs.yml' from remote '.github/workflows/new_prs.yml' -bc69ec84 🔄 Created local '.github/workflows/new_issues.yml' from remote '.github/workflows/new_issues.yml' -19ed1373 Add handbook link to README file -66df8b18 Delete docs folder -6724b148 Update stocksnap tests and example `full_item.json` -ed77e0e2 Get image title from API response instead of the scraped page -8764118f Merge pull request #131 from WordPress/pr_template -73f50dd7 Add a PR template to the repository -d1dde6fc Merge pull request #130 from WordPress/modify_audio_columns -924b2b1b Add stocksnap tests -bb31eb93 Pass license_info instead of license_ and license_version -182e1592 Rename `alt_audio_files` column to `alt_files` -dd4bcff0 Add funtion to merge arrays in sql -8ac0298b Change `genres` column to ArrayColumn type -8e2186ef Add new `ArrayColumn` type -9895bf2d Modify columns in test_audio.py -6b4fe9ff Rename `standardized__popularity` column in view tables -621765b1 Make field for audio genres plural -3a816554 Add watermarked column to sql files -eaaba615 Merge branch 'main' into stocksnap solving conflicts -20772bb4 Make wikimedia script pass license_info, not license_url (#129) -bf5ebd21 Add a script to create provider API script template (#128) -0f82371f Merge pull request #126 from WordPress/rm_duplicate_providers -546fade1 Delete duplicated CommonCrawl providers -d3db16d7 [Quality] Make provider scripts pass validated license_info to the storage module (#66) -9446c7f9 Add support for other media types to popularity calculations (#112) (#124) -8bac45a0 Add missing `watermarked` column to audio loading table (#125) -5a4c5871 Ingest wikimedia images marked with CC0 and PDM (#119) -073c1215 Clean Wikimedia item titles (#120) -5d1ecb2b Add Audio to the database (#111) -ab8fa907 Refactor to make only one extra request per image -93415d90 Add samples files of an image and a api response for tests -127fa29c Add 