From 4ab4e6eec41e3ab2abfa6518dc91c7a995640607 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Tue, 18 May 2021 09:58:56 +0300 Subject: [PATCH 01/18] Update README.md --- README.md | 48 ++++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index e9ca1be26..7a808b9ca 100644 --- a/README.md +++ b/README.md @@ -49,40 +49,29 @@ The steps above are performed in [`ExtractCCLinks.py`][ex_cc_links]. various API ETL jobs which pull and process data from a number of open APIs on the internet. -### Common API Workflows +### Daily API Workflows -The Airflow DAGs defined in [`common_api_workflows.py`][api_flows] manage daily -ETL jobs for the following platforms, by running the linked scripts: +Workflows that have a `schedule_string='@daily'` parameter are run daily. The DAG +workflows run `provider_api_scripts` to load and extract media data from the APIs. +Below are some of the daily DAG workflows that run the corresponding `provider_api_scripts` +daily: -- [Met Museum](src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py) -- [PhyloPic](src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py) -- [Thingiverse](src/cc_catalog_airflow/dags/provider_api_scripts/Thingiverse.py) - -[api_flows]: src/cc_catalog_airflow/dags/common_api_workflows.py - -### Other Daily API Workflows - -Airflow DAGs, defined in their own files, also run the following scripts daily: - -- [Flickr](src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py) -- [Wikimedia Commons](src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py) - -In the future, we'll migrate to the latter style of Airflow DAGs and -accompanying Provider API Scripts. 
+- [Met Museum Workflow](src/cc_catalog_airflow/dags/metropolitan_museum_workflow.py) ( [API script](src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py) ) +- [PhyloPic Workflow](src/cc_catalog_airflow/dags/phylopic_workflow.py) ( [API script](src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py) ) +- [Flickr Workflow](src/cc_catalog_airflow/dags/flickr_workflow.py) ( [API script](src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py) ) +- [Wikimedia Commons Workflow](src/cc_catalog_airflow/dags/wikimedia_workflow.py) ( [Commons API script](src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py) ) ### Monthly Workflow -The Airflow DAG defined in [`monthlyWorkflow.py`][mon_flow] handles the monthly -jobs that are scheduled to run on the 15th day of each month at 16:00 UTC. This -workflow is reserved for long-running jobs or APIs that do not have date -filtering capabilities so the data is reprocessed monthly to keep the catalog -updated. The following tasks are performed: +Some API ingestion workflows are scheduled to run on the 15th day of each +month at 16:00 UTC. These workflows are reserved for long-running jobs or +APIs that do not have date filtering capabilities so the data is reprocessed +monthly to keep the catalog updated. The following tasks are performed monthly: -- [Cleveland Museum of Art](src/cc_catalog_airflow/dags/provider_api_scripts/ClevelandMuseum.py) -- [RawPixel](src/cc_catalog_airflow/dags/provider_api_scripts/RawPixel.py) +- [Cleveland Museum of Art](src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py) +- [RawPixel](src/cc_catalog_airflow/dags/provider_api_scripts/raw_pixel.py) - [Common Crawl Syncer](src/cc_catalog_airflow/dags/commoncrawl_s3_syncer/SyncImageProviders.py) -[mon_flow]: src/cc_catalog_airflow/dags/monthlyWorkflow.py ### DB_Loader @@ -92,11 +81,10 @@ into the upstream database. It includes some data preprocessing steps. 
[db_loader]: src/cc_catalog_airflow/dags/loader_workflow.py -### Other API Jobs (not in the workflow) +### Other API Jobs -- [Brooklyn Museum](src/cc_catalog_airflow/dags/provider_api_scripts/BrooklynMuseum.py) -- [NYPL](src/cc_catalog_airflow/dags/provider_api_scripts/NYPL.py) -- Cleveland Public Library +- [Brooklyn Museum](src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py) +- [NYPL](src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py) ## Development setup for Airflow and API puller scripts From 900ad2216084940e87240e6447fe074e840efa34 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Tue, 25 May 2021 14:01:23 +0300 Subject: [PATCH 02/18] Log the actual URL requested Currently, the base_url and the query params are logged. Adding the actual requested URL to the log makes debugging easier. --- .../dags/provider_api_scripts/common/requester.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/common/requester.py b/src/cc_catalog_airflow/dags/provider_api_scripts/common/requester.py index 057c1f4ea..e359941b1 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/common/requester.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/common/requester.py @@ -41,10 +41,11 @@ def get(self, url, params=None, **kwargs): try: response = requests.get(url, params=params, **kwargs) if response.status_code == requests.codes.ok: + logger.info(f'Received response from url {response.url}') return response else: logger.warning( - f'Unable to request URL: {url}. ' + f'Unable to request URL: {response.url}. 
' f'Status code: {response.status_code}' ) return response From 400bd8f09f5983e827016a86d72ac7174a804004 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Mon, 31 May 2021 07:54:36 +0300 Subject: [PATCH 03/18] Re-add the lint and test workflows from the original repo Signed-off-by: Olga Bulat --- .github/workflows/lint.yml | 27 ++++++++++++++++++++ .github/workflows/push_pull_request_test.yml | 16 ++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/push_pull_request_test.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..a9c1279a1 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,27 @@ +name: flake8_lint + +on: + push: + paths: + - '**.py' + pull_request: + paths: + - '**.py' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + - name: Set up Python 3.9 + uses: actions/setup-python@v1 + with: + python-version: 3.9 + - name: Setup flake8 annotations # To annotate the errors in the pull request + uses: rbialon/flake8-annotations@v1 + - name: Lint with flake8 + # Lint only for changed files, rather than the entire repository + run: | + pip install flake8>=3.7.0 + git diff HEAD^ | flake8 --diff \ No newline at end of file diff --git a/.github/workflows/push_pull_request_test.yml b/.github/workflows/push_pull_request_test.yml new file mode 100644 index 000000000..32c1075dc --- /dev/null +++ b/.github/workflows/push_pull_request_test.yml @@ -0,0 +1,16 @@ +name: Automated testing workflow +on: [push, pull_request] +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build the stack + run: | + cd ./src/cc_catalog_airflow + cp env.template .env + docker-compose up -d + - name: Test + run: | + sleep 10 + docker exec cc_catalog_airflow_webserver_1 /usr/local/airflow/.local/bin/pytest \ No newline at end of file From a61a664908e19591a49fd3ea4b40a991bebe7d77 Mon Sep 17 
00:00:00 2001 From: Olga Bulat Date: Mon, 31 May 2021 07:56:13 +0300 Subject: [PATCH 04/18] Add missing new lines at the end of files Signed-off-by: Olga Bulat --- .github/workflows/lint.yml | 2 +- .github/workflows/push_pull_request_test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index a9c1279a1..b68bad987 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,4 +24,4 @@ jobs: # Lint only for changed files, rather than the entire repository run: | pip install flake8>=3.7.0 - git diff HEAD^ | flake8 --diff \ No newline at end of file + git diff HEAD^ | flake8 --diff diff --git a/.github/workflows/push_pull_request_test.yml b/.github/workflows/push_pull_request_test.yml index 32c1075dc..e4e8fa1e1 100644 --- a/.github/workflows/push_pull_request_test.yml +++ b/.github/workflows/push_pull_request_test.yml @@ -13,4 +13,4 @@ jobs: - name: Test run: | sleep 10 - docker exec cc_catalog_airflow_webserver_1 /usr/local/airflow/.local/bin/pytest \ No newline at end of file + docker exec cc_catalog_airflow_webserver_1 /usr/local/airflow/.local/bin/pytest From 0cd9768665defcefaf43b7c1b3c84b8ec4c8ad1c Mon Sep 17 00:00:00 2001 From: Dhruv Bhanushali Date: Thu, 3 Jun 2021 10:24:10 +0530 Subject: [PATCH 05/18] Create a CODEOWNERS file --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..12c96c788 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# All files are owned by the Openverse Developers team +* @WordPress/openverse-developers From dc7ebe29e11c501cf445802aa95a38b38431df40 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 3 Jun 2021 09:32:50 +0300 Subject: [PATCH 06/18] Add blank lines for readability Signed-off-by: Olga Bulat --- .github/workflows/push_pull_request_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/.github/workflows/push_pull_request_test.yml b/.github/workflows/push_pull_request_test.yml index e4e8fa1e1..8ba4b4b86 100644 --- a/.github/workflows/push_pull_request_test.yml +++ b/.github/workflows/push_pull_request_test.yml @@ -1,8 +1,11 @@ name: Automated testing workflow + on: [push, pull_request] + jobs: test: runs-on: ubuntu-latest + steps: - uses: actions/checkout@v2 - name: Build the stack From 70515f27070cc64ac3f51f88729313aabf8bef3a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 3 Jun 2021 16:11:00 +0000 Subject: [PATCH 07/18] Bump urllib3 from 1.25.11 to 1.26.5 in /src/cc_catalog_airflow Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.25.11 to 1.26.5. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.25.11...1.26.5) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- src/cc_catalog_airflow/requirements_dev.txt | 2 +- src/cc_catalog_airflow/requirements_prod.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cc_catalog_airflow/requirements_dev.txt b/src/cc_catalog_airflow/requirements_dev.txt index edb78a38b..67f691dbe 100644 --- a/src/cc_catalog_airflow/requirements_dev.txt +++ b/src/cc_catalog_airflow/requirements_dev.txt @@ -105,7 +105,7 @@ termcolor==1.1.0 text-unidecode==1.3 typing-extensions==3.10.0.0 unicodecsv==0.14.1 -urllib3==1.25.11 +urllib3==1.26.5 watchtower==0.7.3 Werkzeug==1.0.1 WTForms==2.3.3 diff --git a/src/cc_catalog_airflow/requirements_prod.txt b/src/cc_catalog_airflow/requirements_prod.txt index abc23a9c3..5511e8c0f 100644 --- a/src/cc_catalog_airflow/requirements_prod.txt +++ b/src/cc_catalog_airflow/requirements_prod.txt @@ -118,7 +118,7 @@ toml==0.10.2 traitlets==5.0.5 typing-extensions==3.10.0.0 unicodecsv==0.14.1 -urllib3==1.25.11 +urllib3==1.26.5 watchtower==0.7.3 wcwidth==0.2.5 Werkzeug==1.0.1 From 51290e41b7d369c1be8a4748f38501695c883fb3 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 3 Jun 2021 20:12:49 +0300 Subject: [PATCH 08/18] Remove workflows from workflow-disabled folder Signed-off-by: Olga Bulat --- .github/workflows-disabled/lint.yml | 28 ------------------- .../push_pull_request_test.yml | 16 ----------- .github/workflows/lint.yml | 2 +- 3 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 .github/workflows-disabled/lint.yml delete mode 100644 .github/workflows-disabled/push_pull_request_test.yml diff --git a/.github/workflows-disabled/lint.yml b/.github/workflows-disabled/lint.yml deleted file mode 100644 index 20cd78105..000000000 --- a/.github/workflows-disabled/lint.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: flake8_lint - -on: - push: - paths: - - '**.py' - pull_request: - paths: - - '**.py' - - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.7 - 
uses: actions/setup-python@v1 - with: - python-version: 3.7 - - name: Setup flake8 annotations # To annotate the errors in the pull request - uses: rbialon/flake8-annotations@v1 - - name: Lint with flake8 - # Lint for only changed files, rather than the entire repository - run: | - pip install flake8>=3.7.0 - git diff HEAD^ | flake8 --diff diff --git a/.github/workflows-disabled/push_pull_request_test.yml b/.github/workflows-disabled/push_pull_request_test.yml deleted file mode 100644 index e4e8fa1e1..000000000 --- a/.github/workflows-disabled/push_pull_request_test.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Automated testing workflow -on: [push, pull_request] -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Build the stack - run: | - cd ./src/cc_catalog_airflow - cp env.template .env - docker-compose up -d - - name: Test - run: | - sleep 10 - docker exec cc_catalog_airflow_webserver_1 /usr/local/airflow/.local/bin/pytest diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b68bad987..a9c1279a1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,4 +24,4 @@ jobs: # Lint only for changed files, rather than the entire repository run: | pip install flake8>=3.7.0 - git diff HEAD^ | flake8 --diff + git diff HEAD^ | flake8 --diff \ No newline at end of file From 66b5e693666aa6897a4a0e927fe7c2314eb679a3 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 3 Jun 2021 20:18:58 +0300 Subject: [PATCH 09/18] Add trailing new line to lint workflow Signed-off-by: Olga Bulat --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index a9c1279a1..b68bad987 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,4 +24,4 @@ jobs: # Lint only for changed files, rather than the entire repository run: | pip install flake8>=3.7.0 - git diff HEAD^ | flake8 --diff \ No newline at 
end of file + git diff HEAD^ | flake8 --diff From a22d1931b7593ad6133ff4ae43ae94a1d41bb79e Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Tue, 8 Jun 2021 10:55:49 +0300 Subject: [PATCH 10/18] Ensure Docker loads local_postgres sql scripts in correct order Signed-off-by: Olga Bulat --- .../{airflow_user_db.sql => 01_airflow_user_db.sql} | 0 .../{aws_s3_mock.sql => 02_aws_s3_mock.sql} | 0 ...image_schema.sql => 03_openledger_image_schema.sql} | 0 ...ger_image_view.sql => 04_openledger_image_view.sql} | 0 ...e_schema.sql => 05_openledger_old_image_schema.sql} | 0 src/cc_catalog_airflow/local_postgres/Dockerfile | 10 +++++----- 6 files changed, 5 insertions(+), 5 deletions(-) rename src/cc_catalog_airflow/local_postgres/{airflow_user_db.sql => 01_airflow_user_db.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{aws_s3_mock.sql => 02_aws_s3_mock.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{openledger_image_schema.sql => 03_openledger_image_schema.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{openledger_image_view.sql => 04_openledger_image_view.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{openledger_old_image_schema.sql => 05_openledger_old_image_schema.sql} (100%) diff --git a/src/cc_catalog_airflow/local_postgres/airflow_user_db.sql b/src/cc_catalog_airflow/local_postgres/01_airflow_user_db.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/airflow_user_db.sql rename to src/cc_catalog_airflow/local_postgres/01_airflow_user_db.sql diff --git a/src/cc_catalog_airflow/local_postgres/aws_s3_mock.sql b/src/cc_catalog_airflow/local_postgres/02_aws_s3_mock.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/aws_s3_mock.sql rename to src/cc_catalog_airflow/local_postgres/02_aws_s3_mock.sql diff --git a/src/cc_catalog_airflow/local_postgres/openledger_image_schema.sql b/src/cc_catalog_airflow/local_postgres/03_openledger_image_schema.sql similarity index 100% rename from 
src/cc_catalog_airflow/local_postgres/openledger_image_schema.sql rename to src/cc_catalog_airflow/local_postgres/03_openledger_image_schema.sql diff --git a/src/cc_catalog_airflow/local_postgres/openledger_image_view.sql b/src/cc_catalog_airflow/local_postgres/04_openledger_image_view.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/openledger_image_view.sql rename to src/cc_catalog_airflow/local_postgres/04_openledger_image_view.sql diff --git a/src/cc_catalog_airflow/local_postgres/openledger_old_image_schema.sql b/src/cc_catalog_airflow/local_postgres/05_openledger_old_image_schema.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/openledger_old_image_schema.sql rename to src/cc_catalog_airflow/local_postgres/05_openledger_old_image_schema.sql diff --git a/src/cc_catalog_airflow/local_postgres/Dockerfile b/src/cc_catalog_airflow/local_postgres/Dockerfile index bdc43da3d..fce37f37c 100644 --- a/src/cc_catalog_airflow/local_postgres/Dockerfile +++ b/src/cc_catalog_airflow/local_postgres/Dockerfile @@ -2,9 +2,9 @@ FROM postgres:13.2 ENV POSTGRES_USER=deploy ENV POSTGRES_PASSWORD=deploy ENV POSTGRES_DB=openledger -ADD ./openledger_image_schema.sql /docker-entrypoint-initdb.d -ADD ./openledger_old_image_schema.sql /docker-entrypoint-initdb.d -ADD ./openledger_image_view.sql /docker-entrypoint-initdb.d -ADD ./aws_s3_mock.sql /docker-entrypoint-initdb.d -ADD ./airflow_user_db.sql /docker-entrypoint-initdb.d +ADD 01_airflow_user_db.sql /docker-entrypoint-initdb.d +ADD 02_aws_s3_mock.sql /docker-entrypoint-initdb.d +ADD 03_openledger_image_schema.sql /docker-entrypoint-initdb.d +ADD 04_openledger_image_view.sql /docker-entrypoint-initdb.d +ADD 05_openledger_old_image_schema.sql /docker-entrypoint-initdb.d RUN apt-get -y update && apt-get -y install python3-boto3 postgresql-plpython3-13 From 408cb8a42114768c5b7fbe3a034aeaebbc62e20c Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Tue, 8 Jun 2021 11:55:58 +0300 Subject: 
[PATCH 11/18] Fix the dependency version conflict Signed-off-by: Olga Bulat --- src/cc_catalog_airflow/local_s3/Dockerfile | 3 +-- src/cc_catalog_airflow/requirements_dev.txt | 6 +++--- src/cc_catalog_airflow/requirements_prod.txt | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/cc_catalog_airflow/local_s3/Dockerfile b/src/cc_catalog_airflow/local_s3/Dockerfile index dc8703a11..375d85e40 100644 --- a/src/cc_catalog_airflow/local_s3/Dockerfile +++ b/src/cc_catalog_airflow/local_s3/Dockerfile @@ -1,5 +1,4 @@ -FROM python:3.7-buster - +FROM python:3.9 ARG CCCATALOG_STORAGE_BUCKET ENV SCRIPT_DIR=/app diff --git a/src/cc_catalog_airflow/requirements_dev.txt b/src/cc_catalog_airflow/requirements_dev.txt index 67f691dbe..c72f0da70 100644 --- a/src/cc_catalog_airflow/requirements_dev.txt +++ b/src/cc_catalog_airflow/requirements_dev.txt @@ -1,10 +1,10 @@ apache-airflow[amazon,postgres]==2.0.2 -boto3==1.15.13 +boto3==1.17.53 lxml==4.6.3 psycopg2-binary==2.8.6 tldextract==2.2.2 -ipython==7.23.1 +ipython==7.24.1 pytest==6.2.4 pytest-cov==2.11.1 pytest-mock==3.6.1 @@ -20,7 +20,7 @@ argcomplete==1.12.3 attrs==20.3.0 Babel==2.9.1 blinker==1.4 -botocore==1.18.18 +botocore==1.20.89 cached-property==1.5.2 cattrs==1.6.0 certifi==2020.12.5 diff --git a/src/cc_catalog_airflow/requirements_prod.txt b/src/cc_catalog_airflow/requirements_prod.txt index 5511e8c0f..93553cc5e 100644 --- a/src/cc_catalog_airflow/requirements_prod.txt +++ b/src/cc_catalog_airflow/requirements_prod.txt @@ -1,5 +1,5 @@ apache-airflow[amazon,postgres]==2.0.2 -boto3==1.15.13 +boto3==1.17.53 lxml==4.6.3 psycopg2-binary==2.8.6 tldextract==2.2.2 @@ -16,7 +16,7 @@ attrs==20.3.0 Babel==2.9.1 backcall==0.2.0 blinker==1.4 -botocore==1.18.18 +botocore==1.20.89 cached-property==1.5.2 cattrs==1.6.0 certifi==2020.12.5 From 012df2dbee95e5280e33a9327809b74ca2d629c6 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 9 Jun 2021 08:18:14 +0300 Subject: [PATCH 12/18] Make URL logging less verbose on 
success, more verbose on failure Signed-off-by: Olga Bulat --- src/cc_catalog_airflow/dags/common/requester.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/requester.py b/src/cc_catalog_airflow/dags/common/requester.py index e359941b1..f5fc008f4 100644 --- a/src/cc_catalog_airflow/dags/common/requester.py +++ b/src/cc_catalog_airflow/dags/common/requester.py @@ -33,9 +33,6 @@ def get(self, url, params=None, **kwargs): params: Dictionary of query string params **kwargs: Optional arguments that will be passed to `requests.get` """ - logger.info(f'Processing request for url: {url}') - logger.info(f'Using query parameters {params}') - logger.info(f'Using headers {kwargs.get("headers")}') self._delay_processing() self._last_request = time.time() try: @@ -50,8 +47,10 @@ def get(self, url, params=None, **kwargs): ) return response except Exception as e: - logger.error('There was an error with the request.') + logger.error(f'There was an error with the request for url: {url}.') logger.info(f'{type(e).__name__}: {e}') + logger.info(f'Using query parameters {params}') + logger.info(f'Using headers {kwargs.get("headers")}') return None def _delay_processing(self): From 387c973a4e58a99b0f0e10ef124b4c80834efd77 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 9 Jun 2021 08:31:37 +0300 Subject: [PATCH 13/18] Fix linting errors Signed-off-by: Olga Bulat --- src/cc_catalog_airflow/dags/common/__init__.py | 5 ++++- src/cc_catalog_airflow/dags/common/requester.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cc_catalog_airflow/dags/common/__init__.py b/src/cc_catalog_airflow/dags/common/__init__.py index eccc02db2..16d56a20d 100644 --- a/src/cc_catalog_airflow/dags/common/__init__.py +++ b/src/cc_catalog_airflow/dags/common/__init__.py @@ -1,3 +1,6 @@ +# flake8: noqa from .licenses import constants, licenses -from .storage.image import Image, ImageStore, MockImageStore +from .storage.image 
import ( + Image, ImageStore, MockImageStore +) from .requester import DelayedRequester diff --git a/src/cc_catalog_airflow/dags/common/requester.py b/src/cc_catalog_airflow/dags/common/requester.py index f5fc008f4..a6f388f26 100644 --- a/src/cc_catalog_airflow/dags/common/requester.py +++ b/src/cc_catalog_airflow/dags/common/requester.py @@ -47,7 +47,7 @@ def get(self, url, params=None, **kwargs): ) return response except Exception as e: - logger.error(f'There was an error with the request for url: {url}.') + logger.error(f'Error with the request for url: {url}.') logger.info(f'{type(e).__name__}: {e}') logger.info(f'Using query parameters {params}') logger.info(f'Using headers {kwargs.get("headers")}') From e6a64c6f0e4d467ba3e972dea4bb015b366ad74f Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 9 Jun 2021 09:08:29 +0300 Subject: [PATCH 14/18] Run CI on push only on master Signed-off-by: Olga Bulat --- .github/workflows/lint.yml | 2 ++ .github/workflows/push_pull_request_test.yml | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b68bad987..feda2dfdf 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,6 +2,8 @@ name: flake8_lint on: push: + branches: + - master paths: - '**.py' pull_request: diff --git a/.github/workflows/push_pull_request_test.yml b/.github/workflows/push_pull_request_test.yml index 8ba4b4b86..7394af8b8 100644 --- a/.github/workflows/push_pull_request_test.yml +++ b/.github/workflows/push_pull_request_test.yml @@ -1,6 +1,10 @@ name: Automated testing workflow -on: [push, pull_request] +on: + push: + branches: + - master + pull_request: jobs: test: From 68172b9478079014b791f9b7cb12538ab95eefb7 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 9 Jun 2021 09:10:41 +0300 Subject: [PATCH 15/18] Run CI on push only on main Signed-off-by: Olga Bulat --- .github/workflows/lint.yml | 2 +- .github/workflows/push_pull_request_test.yml | 2 +- 
2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index feda2dfdf..bc6905362 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -3,7 +3,7 @@ name: flake8_lint on: push: branches: - - master + - main paths: - '**.py' pull_request: diff --git a/.github/workflows/push_pull_request_test.yml b/.github/workflows/push_pull_request_test.yml index 7394af8b8..0f13c342e 100644 --- a/.github/workflows/push_pull_request_test.yml +++ b/.github/workflows/push_pull_request_test.yml @@ -3,7 +3,7 @@ name: Automated testing workflow on: push: branches: - - master + - main pull_request: jobs: From 99a6f0e7e7aa483e23f27afa50cbec72b85c0787 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 9 Jun 2021 11:18:10 +0300 Subject: [PATCH 16/18] Add more trailing zeros To allow for future expansion Signed-off-by: Olga Bulat --- ...01_airflow_user_db.sql => 0001_airflow_user_db.sql} | 0 .../{02_aws_s3_mock.sql => 0002_aws_s3_mock.sql} | 0 ...age_schema.sql => 0003_openledger_image_schema.sql} | 0 ...r_image_view.sql => 0004_openledger_image_view.sql} | 0 ...schema.sql => 0005_openledger_old_image_schema.sql} | 0 src/cc_catalog_airflow/local_postgres/Dockerfile | 10 +++++----- 6 files changed, 5 insertions(+), 5 deletions(-) rename src/cc_catalog_airflow/local_postgres/{01_airflow_user_db.sql => 0001_airflow_user_db.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{02_aws_s3_mock.sql => 0002_aws_s3_mock.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{03_openledger_image_schema.sql => 0003_openledger_image_schema.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{04_openledger_image_view.sql => 0004_openledger_image_view.sql} (100%) rename src/cc_catalog_airflow/local_postgres/{05_openledger_old_image_schema.sql => 0005_openledger_old_image_schema.sql} (100%) diff --git a/src/cc_catalog_airflow/local_postgres/01_airflow_user_db.sql 
b/src/cc_catalog_airflow/local_postgres/0001_airflow_user_db.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/01_airflow_user_db.sql rename to src/cc_catalog_airflow/local_postgres/0001_airflow_user_db.sql diff --git a/src/cc_catalog_airflow/local_postgres/02_aws_s3_mock.sql b/src/cc_catalog_airflow/local_postgres/0002_aws_s3_mock.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/02_aws_s3_mock.sql rename to src/cc_catalog_airflow/local_postgres/0002_aws_s3_mock.sql diff --git a/src/cc_catalog_airflow/local_postgres/03_openledger_image_schema.sql b/src/cc_catalog_airflow/local_postgres/0003_openledger_image_schema.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/03_openledger_image_schema.sql rename to src/cc_catalog_airflow/local_postgres/0003_openledger_image_schema.sql diff --git a/src/cc_catalog_airflow/local_postgres/04_openledger_image_view.sql b/src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/04_openledger_image_view.sql rename to src/cc_catalog_airflow/local_postgres/0004_openledger_image_view.sql diff --git a/src/cc_catalog_airflow/local_postgres/05_openledger_old_image_schema.sql b/src/cc_catalog_airflow/local_postgres/0005_openledger_old_image_schema.sql similarity index 100% rename from src/cc_catalog_airflow/local_postgres/05_openledger_old_image_schema.sql rename to src/cc_catalog_airflow/local_postgres/0005_openledger_old_image_schema.sql diff --git a/src/cc_catalog_airflow/local_postgres/Dockerfile b/src/cc_catalog_airflow/local_postgres/Dockerfile index fce37f37c..ee37f9082 100644 --- a/src/cc_catalog_airflow/local_postgres/Dockerfile +++ b/src/cc_catalog_airflow/local_postgres/Dockerfile @@ -2,9 +2,9 @@ FROM postgres:13.2 ENV POSTGRES_USER=deploy ENV POSTGRES_PASSWORD=deploy ENV POSTGRES_DB=openledger -ADD 01_airflow_user_db.sql /docker-entrypoint-initdb.d -ADD 
02_aws_s3_mock.sql /docker-entrypoint-initdb.d -ADD 03_openledger_image_schema.sql /docker-entrypoint-initdb.d -ADD 04_openledger_image_view.sql /docker-entrypoint-initdb.d -ADD 05_openledger_old_image_schema.sql /docker-entrypoint-initdb.d +ADD 0001_airflow_user_db.sql /docker-entrypoint-initdb.d +ADD 0002_aws_s3_mock.sql /docker-entrypoint-initdb.d +ADD 0003_openledger_image_schema.sql /docker-entrypoint-initdb.d +ADD 0004_openledger_image_view.sql /docker-entrypoint-initdb.d +ADD 0005_openledger_old_image_schema.sql /docker-entrypoint-initdb.d RUN apt-get -y update && apt-get -y install python3-boto3 postgresql-plpython3-13 From 3eb439d8e4bd5b5ab2d1de9289ef6309d498929e Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 10 Jun 2021 14:52:35 +0300 Subject: [PATCH 17/18] Run release drafter action on push to main branch We change the default branch name from develop/master to main, this PR updates the action accordingly. --- .github/workflows/release-drafter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml index c997c6053..4d5e273da 100644 --- a/.github/workflows/release-drafter.yml +++ b/.github/workflows/release-drafter.yml @@ -3,7 +3,7 @@ name: Release Drafter on: push: branches: - - develop + - main jobs: update_release_draft: From 273ea6a63ba591724831db4e7d2da91461c8b629 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Thu, 3 Jun 2021 13:21:12 +0300 Subject: [PATCH 18/18] Extract common media functionality from ImageStore to abstract MediaStore Signed-off-by: Olga Bulat --- .../dags/common/storage/image.py | 405 ++++-------------- .../dags/common/storage/media.py | 275 ++++++++++++ .../dags/common/storage/test_image.py | 369 +++++++++------- .../provider_api_scripts/brooklyn_museum.py | 4 +- .../cleveland_museum_of_art.py | 27 +- .../dags/provider_api_scripts/nypl.py | 4 +- .../provider_api_scripts/wikimedia_commons.py | 4 +- .../dags/util/pg_cleaner.py | 6 +- 
.../dags/util/test_pg_cleaner.py | 4 +- 9 files changed, 601 insertions(+), 497 deletions(-) create mode 100644 src/cc_catalog_airflow/dags/common/storage/media.py diff --git a/src/cc_catalog_airflow/dags/common/storage/image.py b/src/cc_catalog_airflow/dags/common/storage/image.py index 49a4fee0c..c3588d706 100644 --- a/src/cc_catalog_airflow/dags/common/storage/image.py +++ b/src/cc_catalog_airflow/dags/common/storage/image.py @@ -1,15 +1,13 @@ from collections import namedtuple -from datetime import datetime import logging -import os +from typing import Optional, Dict, Union -from common.licenses import licenses -from common.storage import util from common.storage import columns +from common.storage.media import MediaStore logger = logging.getLogger(__name__) -_IMAGE_TSV_COLUMNS = [ +IMAGE_TSV_COLUMNS = [ # The order of this list maps to the order of the columns in the TSV. columns.StringColumn( name='foreign_identifier', required=False, size=3000, truncate=False @@ -66,37 +64,10 @@ ) ] -Image = namedtuple( - 'Image', - [c.NAME for c in _IMAGE_TSV_COLUMNS] -) +Image = namedtuple("Image", [c.NAME for c in IMAGE_TSV_COLUMNS]) -# Filter out tags that exactly match these terms. All terms should be -# lowercase. -TAG_BLACKLIST = { - 'no person', - 'squareformat' -} -# Filter out tags that contain the following terms. All entrees should be -# lowercase. -TAG_CONTAINS_BLACKLIST = { - 'flickriosapp', - 'uploaded', - ':', - '=', - 'cc0', - 'by', - 'by-nc', - 'by-nd', - 'by-sa', - 'by-nc-nd', - 'by-nc-sa', - 'pdm' -} - - -class ImageStore: +class ImageStore(MediaStore): """ A class that stores image information from a given provider. 
@@ -110,54 +81,48 @@ class ImageStore: """ def __init__( - self, - provider=None, - output_file=None, - output_dir=None, - buffer_length=100 + self, + provider=None, + output_file=None, + output_dir=None, + buffer_length=100, + media_type="image", + tsv_columns=None, ): - logger.info(f'Initialized with provider {provider}') - self._image_buffer = [] - self._total_images = 0 - self._PROVIDER = provider - self._BUFFER_LENGTH = buffer_length - self._NOW = datetime.now() - self._OUTPUT_PATH = self._initialize_output_path( - output_dir, - output_file, - provider, + super().__init__( + provider, output_file, output_dir, buffer_length, media_type ) + self.columns = IMAGE_TSV_COLUMNS \ + if tsv_columns is None else tsv_columns def add_item( - self, - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - license_url=None, - license_=None, - license_version=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked='f', - source=None + self, + foreign_landing_url: str, + image_url: str, + thumbnail_url: Optional[str] = None, + license_url: Optional[str] = None, + license_: Optional[str] = None, + license_version: Optional[str] = None, + foreign_identifier: Optional[str] = None, + width: Optional[int] = None, + height: Optional[int] = None, + creator: Optional[str] = None, + creator_url: Optional[str] = None, + title: Optional[str] = None, + meta_data: Optional[Union[Dict, str]] = None, + raw_tags=None, + watermarked: Optional[str] = "f", + source: Optional[str] = None, ): """ Add information for a single image to the ImageStore. Required Arguments: - foreign_landing_url: URL of page where the image lives on the source website. image_url: Direct link to the image file Semi-Required Arguments - license_url: URL of the license for the image on the Creative Commons website. 
license_: String representation of a Creative Commons @@ -165,7 +130,7 @@ def add_item( `common.license.constants.get_license_path_map()` license_version: Version of the given license. In the case of the `publicdomain` license, which has no - version, one shoud pass + version, one should pass `common.license.constants.NO_VERSION` here. Note on license arguments: These are 'semi-required' in that @@ -174,7 +139,6 @@ def add_item( image data will be discarded. Optional Arguments: - thumbnail_url: Direct link to a thumbnail-sized version of the image foreign_identifier: Unique identifier for the image on the @@ -201,205 +165,57 @@ def add_item( ImageStore init function is the specific provider of the image. """ - image = self._get_image( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_url=license_url, - license_=license_, - license_version=license_version, - foreign_identifier=foreign_identifier, - width=width, - height=height, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, - raw_tags=raw_tags, - watermarked=watermarked, - source=source - ) - tsv_row = self._create_tsv_row(image) - if tsv_row: - self._image_buffer.append(tsv_row) - self._total_images += 1 - if len(self._image_buffer) >= self._BUFFER_LENGTH: - self._flush_buffer() - - return self._total_images - - def commit(self): - """Writes all remaining images in the buffer to disk.""" - self._flush_buffer() - - return self._total_images - - def _initialize_output_path(self, output_dir, output_file, provider): - if output_dir is None: - logger.info( - 'No given output directory. ' - 'Using OUTPUT_DIR from environment.' - ) - output_dir = os.getenv('OUTPUT_DIR') - if output_dir is None: - logger.warning( - 'OUTPUT_DIR is not set in the enivronment. ' - 'Output will go to /tmp.' 
- ) - output_dir = '/tmp' - - if output_file is not None: - output_file = str(output_file) - else: - output_file = ( - f'{provider}_{datetime.strftime(self._NOW, "%Y%m%d%H%M%S")}' - f'.tsv' - ) - - output_path = os.path.join(output_dir, output_file) - logger.info(f'Output path: {output_path}') - return output_path - - def _get_total_images(self): - return self._total_images - - """Get total images for directly using in scripts.""" - total_images = property(_get_total_images) - - def _get_image( - self, - foreign_identifier, - foreign_landing_url, - image_url, - thumbnail_url, - width, - height, + valid_license, raw_license_url = self.get_valid_license_info( license_url, license_, - license_version, - creator, - creator_url, - title, - meta_data, - raw_tags, - watermarked, - source, - ): - valid_license_info = licenses.get_license_info( - license_url=license_url, - license_=license_, - license_version=license_version - ) - source = util.get_source(source, self._PROVIDER) - meta_data = self._enrich_meta_data( - meta_data, - license_url=valid_license_info.url, - raw_license_url=license_url - ) - tags = self._enrich_tags(raw_tags) - - return Image( - foreign_identifier=foreign_identifier, - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_=valid_license_info.license, - license_version=valid_license_info.version, - width=width, - height=height, - filesize=None, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, - tags=tags, - watermarked=watermarked, - provider=self._PROVIDER, - source=source + license_version ) - - def _create_tsv_row( - self, - image, - columns=_IMAGE_TSV_COLUMNS - ): - row_length = len(columns) - prepared_strings = [ - columns[i].prepare_string(image[i]) for i in range(row_length) - ] - logger.debug(f'Prepared strings list:\n{prepared_strings}') - for i in range(row_length): - if columns[i].REQUIRED and prepared_strings[i] is None: - logger.warning(f'Row missing 
required {columns[i].NAME}') - return None - else: - return '\t'.join( - [s if s is not None else '\\N' for s in prepared_strings] - ) + '\n' - - def _flush_buffer(self): - buffer_length = len(self._image_buffer) - if buffer_length > 0: - logger.info( - f'Writing {buffer_length} lines from buffer to disk.' - ) - with open(self._OUTPUT_PATH, 'a') as f: - f.writelines(self._image_buffer) - self._image_buffer = [] - logger.debug( - f'Total Images Processed so far: {self._total_images}' - ) - else: - logger.debug('Empty buffer! Nothing to write.') - return buffer_length - - def _tag_blacklisted(self, tag): - """ - Tag is banned or contains a banned substring. - :param tag: the tag to be verified against the blacklist - :return: true if tag is blacklisted, else returns false - """ - if type(tag) == dict: # check if the tag is already enriched - tag = tag.get('name') - if tag in TAG_BLACKLIST: - return True - for blacklisted_substring in TAG_CONTAINS_BLACKLIST: - if blacklisted_substring in tag: - return True - return False - - def _enrich_meta_data(self, meta_data, license_url, raw_license_url): - if type(meta_data) != dict: + if valid_license.license is None: logger.debug( - f'`meta_data` is not a dictionary: {meta_data}' - ) - enriched_meta_data = { - 'license_url': license_url, 'raw_license_url': raw_license_url - } - else: - enriched_meta_data = meta_data - enriched_meta_data.update( - license_url=license_url, raw_license_url=raw_license_url - ) - return enriched_meta_data - - def _enrich_tags(self, raw_tags): - if type(raw_tags) != list: - logger.debug('`tags` is not a list.') + f"Invalid image license : {license_url}," + "{license_}, {license_version}") return None - else: - return [ - self._format_raw_tag(tag) for tag in raw_tags - if not self._tag_blacklisted(tag) - ] + image_data = { + 'foreign_landing_url': foreign_landing_url, + 'image_url': image_url, + 'thumbnail_url': thumbnail_url, + 'license_url': valid_license.url, + 'license_': valid_license.license, + 
'license_version': valid_license.version, + 'raw_license_url': raw_license_url, + 'foreign_identifier': foreign_identifier, + 'width': width, + 'height': height, + 'creator': creator, + 'creator_url': creator_url, + 'title': title, + 'meta_data': meta_data, + 'raw_tags': raw_tags, + 'watermarked': watermarked, + 'source': source, + } + image = self._get_image(**image_data) + if image is not None: + self.save_item(image) + return self.total_items + + def _get_image(self, **kwargs) -> Image: + """Validates image information and returns Image namedtuple""" + + kwargs['source'] = self.get_source(kwargs['source']) + kwargs['meta_data'], kwargs['tags'] = self.parse_item_metadata( + kwargs['license_url'], + kwargs.get('raw_license_url'), + kwargs.get('meta_data'), + kwargs.get('raw_tags'), + ) + kwargs.pop('raw_tags', None) + kwargs.pop('raw_license_url', None) + kwargs.pop('license_url', None) + kwargs['provider'] = self._PROVIDER + kwargs['filesize'] = None - def _format_raw_tag(self, tag): - if type(tag) == dict and tag.get('name') and tag.get('provider'): - logger.debug(f'Tag already enriched: {tag}') - return tag - else: - logger.debug(f'Enriching tag: {tag}') - return {'name': tag, 'provider': self._PROVIDER} + return Image(**kwargs) class MockImageStore(ImageStore): @@ -416,62 +232,13 @@ class MockImageStore(ImageStore): """ def __init__( - self, - provider=None, - output_file=None, - output_dir=None, - buffer_length=100, - license_info=None + self, + provider=None, + output_file=None, + output_dir=None, + buffer_length=100, + license_info=None, ): - logger.info(f'Initialized with provider {provider}') + logger.info(f"Initialized with provider {provider}") super().__init__(provider=provider) self.license_info = license_info - - def _get_image( - self, - foreign_identifier, - foreign_landing_url, - image_url, - thumbnail_url, - width, - height, - license_url, - license_, - license_version, - creator, - creator_url, - title, - meta_data, - raw_tags, - watermarked, - 
source,
-    ):
-        valid_license_info = self.license_info
-
-        source = util.get_source(source, self._PROVIDER)
-        meta_data = self._enrich_meta_data(
-            meta_data,
-            license_url=valid_license_info.url,
-            raw_license_url=license_url
-        )
-        tags = self._enrich_tags(raw_tags)
-
-        return Image(
-            foreign_identifier=foreign_identifier,
-            foreign_landing_url=foreign_landing_url,
-            image_url=image_url,
-            thumbnail_url=thumbnail_url,
-            license_=valid_license_info.license,
-            license_version=valid_license_info.version,
-            width=width,
-            height=height,
-            filesize=None,
-            creator=creator,
-            creator_url=creator_url,
-            title=title,
-            meta_data=meta_data,
-            tags=tags,
-            watermarked=watermarked,
-            provider=self._PROVIDER,
-            source=source
-        )
diff --git a/src/cc_catalog_airflow/dags/common/storage/media.py b/src/cc_catalog_airflow/dags/common/storage/media.py
new file mode 100644
index 000000000..8deeca2f3
--- /dev/null
+++ b/src/cc_catalog_airflow/dags/common/storage/media.py
@@ -0,0 +1,275 @@
+import abc
+from datetime import datetime
+import logging
+import os
+from typing import Optional, Union
+
+from common.storage import util
+from common.licenses import licenses
+
+logger = logging.getLogger(__name__)
+
+# Filter out tags that exactly match these terms. All terms should be
+# lowercase.
+TAG_BLACKLIST = {"no person", "squareformat"}
+
+# Filter out tags that contain the following terms. All entries should be
+# lowercase.
+TAG_CONTAINS_BLACKLIST = {
+    "flickriosapp",
+    "uploaded",
+    ":",
+    "=",
+    "cc0",
+    "by",
+    "by-nc",
+    "by-nd",
+    "by-sa",
+    "by-nc-nd",
+    "by-nc-sa",
+    "pdm",
+}
+
+
+class MediaStore(metaclass=abc.ABCMeta):
+    """
+    An abstract base class that stores media information from a given provider.
+
+    Optional init arguments:
+    provider: String marking the provider in the `media`
+    (`image`, `audio` etc) table of the DB.
+    output_file: String giving a temporary .tsv filename (*not* the
+    full path) where the media info should be stored.
+ output_dir: String giving a path where `output_file` should be placed. + buffer_length: Integer giving the maximum number of media information rows + to store in memory before writing them to disk. + """ + + def __init__( + self, + provider: Optional[str] = None, + output_file: Optional[str] = None, + output_dir: Optional[str] = None, + buffer_length: int = 100, + media_type: Optional[str] = "generic", + ): + logger.info(f"Initialized {media_type} MediaStore" + " with provider {provider}") + self.media_type = media_type + self._media_buffer = [] + self._total_items = 0 + self._PROVIDER = provider + self._BUFFER_LENGTH = buffer_length + self._NOW = datetime.now() + self._OUTPUT_PATH = self._initialize_output_path( + output_dir, + output_file, + provider, + ) + self.columns = None + + def save_item(self, media) -> None: + """ + Appends item data to the buffer as a tsv row, + only if data is valid. + + Args: + media: a namedtuple with validated media metadata + """ + tsv_row = self._create_tsv_row(media) + if tsv_row: + self._media_buffer.append(tsv_row) + self._total_items += 1 + if len(self._media_buffer) >= self._BUFFER_LENGTH: + self._flush_buffer() + + @abc.abstractmethod + def add_item(self, **kwargs): + """ + Abstract method to clean the item data and add it to the store + """ + pass + + @staticmethod + def get_valid_license_info( + license_url, + license_, + license_version, + ): + valid_license_info = licenses.get_license_info( + license_url=license_url, + license_=license_, + license_version=license_version + ) + if valid_license_info.url != license_url: + raw_license_url = license_url + else: + raw_license_url = None + return valid_license_info, raw_license_url + + def get_source(self, source): + return util.get_source(source, self._PROVIDER) + + def parse_item_metadata( + self, + license_url, + raw_license_url, + meta_data, + raw_tags, + ): + meta_data = self._enrich_meta_data( + meta_data, + license_url=license_url, + raw_license_url=raw_license_url + ) 
+        tags = self._enrich_tags(raw_tags)
+        return meta_data, tags
+
+    def commit(self):
+        """Writes all remaining media items in the buffer to disk."""
+        self._flush_buffer()
+        return self.total_items
+
+    def _initialize_output_path(
+            self,
+            output_dir: Optional[str],
+            output_file: Optional[str],
+            provider: str,
+    ) -> str:
+        """Creates the path for the tsv file.
+        If output_dir and output_file are not given,
+        the following filename is used:
+        `/tmp/{provider_name}_{media_type}_{timestamp}.tsv`
+
+        Returns:
+            Path of the tsv file to write media data pulled from providers
+        """
+        if output_dir is None:
+            logger.info("No given output directory. "
+                        "Using OUTPUT_DIR from environment.")
+            output_dir = os.getenv("OUTPUT_DIR")
+        if output_dir is None:
+            logger.warning(
+                "OUTPUT_DIR is not set in the environment. "
+                "Output will go to /tmp."
+            )
+            output_dir = "/tmp"
+
+        if output_file is not None:
+            output_file = str(output_file)
+        else:
+            datetime_string = datetime.strftime(
+                self._NOW, '%Y%m%d%H%M%S')
+            output_file = (
+                f"{provider}_{self.media_type}"
+                f"_{datetime_string}.tsv"
+            )
+
+        output_path = os.path.join(output_dir, output_file)
+        logger.info(f"Output path: {output_path}")
+        return output_path
+
+    @property
+    def total_items(self):
+        """Get total items for directly using in scripts."""
+        return self._total_items
+
+    def _create_tsv_row(self, item):
+        row_length = len(self.columns)
+        prepared_strings = [
+            self.columns[i].prepare_string(item[i]) for i in range(row_length)
+        ]
+        logger.debug(f"Prepared strings list:\n{prepared_strings}")
+        for i in range(row_length):
+            if self.columns[i].REQUIRED and prepared_strings[i] is None:
+                logger.warning(f"Row missing required {self.columns[i].NAME}")
+                return None
+        else:
+            return (
+                "\t".join(
+                    [s if s is not None else "\\N"
+                     for s in prepared_strings])
+                + "\n"
+            )
+
+    def _flush_buffer(self) -> int:
+        buffer_length = len(self._media_buffer)
+        if buffer_length > 0:
+            logger.info(f"Writing {buffer_length} lines 
from buffer to disk.") + with open(self._OUTPUT_PATH, "a") as f: + f.writelines(self._media_buffer) + self._media_buffer = [] + logger.debug( + f"Total Media Items Processed so far: {self._total_items}" + ) + else: + logger.debug("Empty buffer! Nothing to write.") + return buffer_length + + @staticmethod + def _tag_blacklisted(tag: Union[str, dict]) -> bool: + """ + Tag is banned or contains a banned substring. + + Args: + tag: the tag to be verified against the blacklist + + Returns: + True if tag is blacklisted, else False + """ + if type(tag) == dict: # check if the tag is already enriched + tag = tag.get("name") + if tag in TAG_BLACKLIST: + return True + for blacklisted_substring in TAG_CONTAINS_BLACKLIST: + if blacklisted_substring in tag: + return True + return False + + @staticmethod + def _enrich_meta_data( + meta_data, license_url, raw_license_url) -> dict: + """ + Makes sure that meta_data is a dictionary, and contains + license_url and raw_license_url + """ + if type(meta_data) != dict: + logger.debug(f"`meta_data` is not a dictionary: {meta_data}") + enriched_meta_data = { + "license_url": license_url, + "raw_license_url": raw_license_url, + } + else: + enriched_meta_data = meta_data + enriched_meta_data.update( + license_url=license_url, raw_license_url=raw_license_url + ) + return enriched_meta_data + + def _enrich_tags(self, raw_tags) -> Optional[list]: + """Takes a list of tags and adds provider information to them + + Args: + raw_tags: List of strings or dictionaries + + Returns: + A list of 'enriched' tags: + {"name": "tag_name", "provider": self._PROVIDER} + """ + if type(raw_tags) != list: + logger.debug("`tags` is not a list.") + return None + else: + return [ + self._format_raw_tag(tag) + for tag in raw_tags + if not self._tag_blacklisted(tag) + ] + + def _format_raw_tag(self, tag): + if type(tag) == dict and tag.get("name") and tag.get("provider"): + logger.debug(f"Tag already enriched: {tag}") + return tag + else: + logger.debug(f"Enriching 
tag: {tag}") + return {"name": tag, "provider": self._PROVIDER} diff --git a/src/cc_catalog_airflow/dags/common/storage/test_image.py b/src/cc_catalog_airflow/dags/common/storage/test_image.py index 78f6871bb..a6788c8cf 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_image.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_image.py @@ -1,10 +1,15 @@ import logging +from unittest.mock import patch + import requests import pytest import tldextract +from common.licenses import licenses from common.storage import image +from common.storage import util + logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', @@ -13,7 +18,7 @@ logger = logging.getLogger(__name__) # This avoids needing the internet for testing. -image.licenses.urls.tldextract.extract = tldextract.TLDExtract( +licenses.urls.tldextract.extract = tldextract.TLDExtract( suffix_list_urls=None ) image.columns.urls.tldextract.extract = tldextract.TLDExtract( @@ -31,7 +36,7 @@ def mock_rewriter(monkeypatch): def mock_rewrite_redirected_url(url_string): return url_string monkeypatch.setattr( - image.licenses.urls, + licenses.urls, 'rewrite_redirected_url', mock_rewrite_redirected_url, ) @@ -41,7 +46,7 @@ def mock_rewrite_redirected_url(url_string): def get_good(monkeypatch): def mock_get(url, timeout=60): return requests.Response() - monkeypatch.setattr(image.licenses.urls.requests, 'get', mock_get) + monkeypatch.setattr(licenses.urls.requests, 'get', mock_get) def test_ImageStore_uses_OUTPUT_DIR_variable( @@ -80,7 +85,7 @@ def test_ImageStore_add_item_adds_realistic_image_to_buffer( image_url='https://images.org/image01.jpg', license_url=license_url, ) - assert len(image_store._image_buffer) == 1 + assert len(image_store._media_buffer) == 1 def test_ImageStore_add_item_adds_multiple_images_to_buffer( @@ -107,7 +112,7 @@ def test_ImageStore_add_item_adds_multiple_images_to_buffer( image_url='https://images.org/image04.jpg', 
license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert len(image_store._image_buffer) == 4 + assert len(image_store._media_buffer) == 4 def test_ImageStore_add_item_flushes_buffer( @@ -145,7 +150,7 @@ def test_ImageStore_add_item_flushes_buffer( image_url='https://images.org/image04.jpg', license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert len(image_store._image_buffer) == 1 + assert len(image_store._media_buffer) == 1 with open(tmp_path_full) as f: lines = f.read().split('\n') assert len(lines) == 4 # recall the last '\n' will create an empty line. @@ -173,7 +178,7 @@ def test_ImageStore_produces_correct_total_images(mock_rewriter, setup_env): image_url='https://images.org/image03.jpg', license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert image_store.total_images == 3 + assert image_store.total_items == 3 def test_ImageStore_get_image_places_given_args( @@ -183,12 +188,12 @@ def test_ImageStore_get_image_places_given_args( image_store = image.ImageStore(provider='testing_provider') args_dict = { 'foreign_landing_url': 'https://landing_page.com', - 'image_url': 'http://imageurl.com', + 'image_url': 'https://imageurl.com', 'license_': 'testlicense', 'license_version': '1.0', 'license_url': None, 'foreign_identifier': 'foreign_id', - 'thumbnail_url': 'http://thumbnail.com', + 'thumbnail_url': 'https://thumbnail.com', 'width': 200, 'height': 500, 'creator': 'tyler', @@ -201,19 +206,20 @@ def test_ImageStore_get_image_places_given_args( } def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( + return licenses.LicenseInfo( license_, license_version, license_url - ) + ), license_url + monkeypatch.setattr( - image.licenses, - 'get_license_info', + image_store, + 'get_valid_license_info', mock_license_chooser ) def mock_get_source(source, provider): return source monkeypatch.setattr( - image.util, + util, 'get_source', mock_get_source ) @@ -234,28 +240,65 @@ 
def mock_enrich_tags(tags): assert actual_image == image.Image(**args_dict) -def test_ImageStore_get_image_calls_license_chooser( +def test_ImageStore_add_item_calls_license_chooser( monkeypatch, setup_env, ): image_store = image.ImageStore() def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( + return licenses.LicenseInfo( 'diff_license', None, license_url - ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) + ), license_url + + def item_saver(arg): + return arg + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + with patch.object( + image_store, + 'get_valid_license_info', + mock_license_chooser): + image_store.add_item( + license_url='https://license/url', + license_='license', + license_version='1.5', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=None, + watermarked=None, + source=None, + ) + actual_image = mock_save.call_args[0][0] + + assert actual_image.license_ == 'diff_license' + assert actual_image.meta_data['license_url'] == 'https://license/url' + assert mock_save.call_count == 1 + + +def test_ImageStore_returns_None_when_license_is_None( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() - actual_image = image_store._get_image( + actual_image = image_store.add_item( license_url='https://license/url', - license_='license', + license_=None, license_version='1.5', - foreign_landing_url=None, - image_url=None, + foreign_landing_url='', + image_url='', thumbnail_url=None, foreign_identifier=None, width=None, @@ -268,7 +311,7 @@ def mock_license_chooser(license_url, license_, license_version): watermarked=None, source=None, ) - assert actual_image.license_ == 'diff_license' + assert actual_image is None def test_ImageStore_get_image_gets_source( @@ -279,12 
+322,13 @@ def test_ImageStore_get_image_gets_source( def mock_get_source(source, provider): return 'diff_source' - monkeypatch.setattr(image.util, 'get_source', mock_get_source) + + monkeypatch.setattr(util, 'get_source', mock_get_source) actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -302,146 +346,165 @@ def mock_get_source(source, provider): assert actual_image.source == 'diff_source' -def test_ImageStore_get_image_replaces_non_dict_meta_data_with_no_license_url( +def test_ImageStore_add_image_replaces_non_dict_meta_data_with_no_license_url( setup_env, ): image_store = image.ImageStore() - actual_image = image_store._get_image( - license_url=None, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data='notadict', - raw_tags=None, - watermarked=None, - source=None, - ) + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=None, + license_='by-nc-nd', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data='notadict', + raw_tags=None, + watermarked=None, + source=None, + ) + actual_image = mock_save.call_args[0][0] assert actual_image.meta_data == { - 'license_url': None, 'raw_license_url': None + 'license_url': 'https://creativecommons.org/licenses/by-nc-nd/4.0/', + 'raw_license_url': None, } -def test_ImageStore_get_image_creates_meta_data_with_valid_license_url( +def test_ImageStore_add_item_creates_meta_data_with_valid_license_url( monkeypatch, setup_env 
): + image_store = image.ImageStore() + def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, license_url + return licenses.LicenseInfo(license_, license_version, license_url), license_url + + monkeypatch.setattr(image_store, 'get_valid_license_info', mock_license_chooser) + license_url = "https://my.license.url" + + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=license_url, + license_='license', + license_version='1.5', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=None, + watermarked=None, + source=None, ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - license_url = 'https://my.license.url' - image_store = image.ImageStore() + actual_image = mock_save.call_args[0][0] - actual_image = image_store._get_image( - license_url=license_url, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked=None, - source=None, - ) - assert actual_image.meta_data == { - 'license_url': license_url, 'raw_license_url': license_url - } + assert actual_image.meta_data == { + 'license_url': license_url, 'raw_license_url': license_url + } -def test_ImageStore_get_image_adds_valid_license_url_to_dict_meta_data( +def test_ImageStore_add_item_adds_valid_license_url_to_dict_meta_data( monkeypatch, setup_env ): - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, license_url - ) - monkeypatch.setattr( - image.licenses, - 
'get_license_info', - mock_license_chooser - ) image_store = image.ImageStore() - actual_image = image_store._get_image( - license_url='https://license/url', - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data={'key1': 'val1'}, - raw_tags=None, - watermarked=None, - source=None, - ) - assert actual_image.meta_data == { - 'key1': 'val1', - 'license_url': 'https://license/url', - 'raw_license_url': 'https://license/url' - } + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url='https://license/url', + license_='by', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data={'key1': 'val1'}, + raw_tags=None, + watermarked=None, + source=None, + ) + actual_image = mock_save.call_args[0][0] + + assert actual_image.meta_data == { + 'key1': 'val1', + 'license_url': 'https://creativecommons.org/licenses/by/4.0/', + 'raw_license_url': 'https://license/url' + } -def test_ImageStore_get_image_fixes_invalid_license_url( +def test_ImageStore_add_item_fixes_invalid_license_url( monkeypatch, setup_env ): - original_url = 'https://license/url', - updated_url = 'https://updatedurl.com' + image_store = image.ImageStore() + + original_url = "https://license/url" + updated_url = "https://updatedurl.com" def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, updated_url + return licenses.LicenseInfo(license_, license_version, updated_url), license_url + + monkeypatch.setattr(image_store, "get_valid_license_info", mock_license_chooser) + + def item_saver(arg): + pass + + with 
patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=original_url, + license_='license', + license_version='1.5', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data={}, + raw_tags=None, + watermarked=None, + source=None, ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - image_store = image.ImageStore() + actual_image = mock_save.call_args[0][0] - actual_image = image_store._get_image( - license_url=original_url, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data={}, - raw_tags=None, - watermarked=None, - source=None, - ) assert actual_image.meta_data == { 'license_url': updated_url, 'raw_license_url': original_url } @@ -454,8 +517,8 @@ def test_ImageStore_get_image_enriches_singleton_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by-sa', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -492,8 +555,8 @@ def test_ImageStore_get_image_tag_blacklist( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -520,8 +583,8 @@ def test_ImageStore_get_image_enriches_multiple_tags( image_store = image.ImageStore('test_provider') actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -556,8 
+619,8 @@ def test_ImageStore_get_image_leaves_preenriched_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -584,8 +647,8 @@ def test_ImageStore_get_image_nones_nonlist_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py index 05a066b69..a685ac860 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py @@ -45,12 +45,12 @@ def main(): logger.debug(len(objects_batch)) if type(objects_batch) == list and len(objects_batch) > 0: _process_objects_batch(objects_batch) - logger.debug(f"Images till now {image_store.total_images}") + logger.debug(f"Images till now {image_store.total_items}") offset += LIMIT else: condition = False image_store.commit() - logger.info(f"Total images recieved {image_store.total_images}") + logger.info(f"Total images received {image_store.total_items}") def _get_query_param( diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py index 6e9ca1d97..7efac541c 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py @@ -89,9 +89,8 @@ def _get_response( return response_json, total_images -def _handle_response( - batch - ): +def _handle_response(batch): + total_images = 0 for data in batch: license_ = 
data.get('share_license_status', '').lower() if license_ != 'cc0': @@ -121,17 +120,17 @@ def _handle_response( creator_name = '' total_images = image_store.add_item( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - license_=license_, - license_version=license_version, - foreign_identifier=foreign_id, - width=width, - height=height, - title=title, - creator=creator_name, - meta_data=metadata, - ) + foreign_landing_url=foreign_landing_url, + image_url=image_url, + license_=license_, + license_version=license_version, + foreign_identifier=foreign_id, + width=width, + height=height, + title=title, + creator=creator_name, + meta_data=metadata, + ) return total_images diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py index becbff900..d9c9c5041 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py @@ -56,12 +56,12 @@ def main(): results = request_response.get("result") if type(results) == list and len(results) > 0: _handle_results(results) - logger.info(f"{image_store.total_images} images till now") + logger.info(f"{image_store.total_items} images till now") page = page + 1 else: condition = False image_store.commit() - logger.info(f"total images {image_store.total_images}") + logger.info(f"total images {image_store.total_items}") def _get_query_param( diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py index 2ab31cd63..ee3eba490 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py @@ -86,13 +86,13 @@ def main(date): image_pages = _get_image_pages(image_batch) if image_pages: _process_image_pages(image_pages) - total_images = image_store.total_images + total_images = image_store.total_items 
logger.info(f'Total Images so far: {total_images}') if not continue_token: break image_store.commit() - total_images = image_store.total_images + total_images = image_store.total_items logger.info(f'Total images: {total_images}') logger.info('Terminated!') diff --git a/src/cc_catalog_airflow/dags/util/pg_cleaner.py b/src/cc_catalog_airflow/dags/util/pg_cleaner.py index ca85f211e..50410479d 100644 --- a/src/cc_catalog_airflow/dags/util/pg_cleaner.py +++ b/src/cc_catalog_airflow/dags/util/pg_cleaner.py @@ -197,7 +197,7 @@ def _select_records(postgres_conn_id, prefix, image_table=IMAGE_TABLE_NAME): def _clean_single_row(record, image_store_dict, prefix): dirty_row = ImageTableRow(*record) image_store = image_store_dict[(dirty_row.provider, prefix)] - total_images_before = image_store.total_images + total_images_before = image_store.total_items license_lower = dirty_row.license_.lower() if dirty_row.license_ else None tags_list = [t for t in dirty_row.tags if t] if dirty_row.tags else None image_store.add_item( @@ -218,7 +218,7 @@ def _clean_single_row(record, image_store_dict, prefix): watermarked=dirty_row.watermarked, source=dirty_row.source, ) - if not image_store.total_images - total_images_before == 1: + if not image_store.total_items - total_images_before == 1: logger.warning(f"Record {dirty_row} was not stored!") _save_failure_identifier(dirty_row.identifier) @@ -233,7 +233,7 @@ def _save_failure_identifier(identifier, output_path=OUTPUT_PATH): def _log_and_check_totals(total_rows, image_store_dict): - image_totals = {k: v.total_images for k, v in image_store_dict.items()} + image_totals = {k: v.total_items for k, v in image_store_dict.items()} total_images_sum = sum(image_totals.values()) logger.info(f"Total images cleaned: {total_images_sum}") logger.info(f"Image Totals breakdown: {image_totals}") diff --git a/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py b/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py index 93f8f2bbf..d9d76f6b7 100644 --- 
a/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py +++ b/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py @@ -1222,7 +1222,7 @@ def test_clean_single_row_doesnt_reuse_wrong_image_store_and_adds_row(): def test_log_and_check_totals_raises_when_number_of_images_cleaned_is_wrong( monkeypatch, ): - monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_images", 1) + monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_items", 1) expected_calls = [ call.info("Total images cleaned: 2"), call.info( @@ -1243,7 +1243,7 @@ def test_log_and_check_totals_raises_when_number_of_images_cleaned_is_wrong( def test_log_and_check_totals_logs(monkeypatch): - monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_images", 1) + monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_items", 1) expected_calls = [ call.info("Total images cleaned: 2"), call.info(