From 1134d50e047476a44e796b09da92ca7c04fff144 Mon Sep 17 00:00:00 2001
From: Katy Baulch <46493669+katybaulch@users.noreply.github.com>
Date: Thu, 25 Jan 2024 10:17:39 +0000
Subject: [PATCH] PDCT-796 Create archive containing README and data dump CSV.
 (#215)

---
 app/api/api_v1/routers/search.py | 19 +++++++++++--------
 app/core/download.py             | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/app/api/api_v1/routers/search.py b/app/api/api_v1/routers/search.py
index 4f2010f5..c5aedc49 100644
--- a/app/api/api_v1/routers/search.py
+++ b/app/api/api_v1/routers/search.py
@@ -29,7 +29,7 @@
     VESPA_SECRETS_LOCATION,
     VESPA_URL,
 )
-from app.core.download import generate_data_dump_as_csv
+from app.core.download import create_data_download_zip_archive
 from app.core.lookups import get_countries_for_region, get_country_by_slug
 from app.core.search import (
     ENCODER,
@@ -188,7 +188,8 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
             detail="Missing required environment variables",
         )
 
-    data_dump_s3_key = "navigator/whole_data_dump.csv"
+    s3_prefix = "navigator/dumps"
+    data_dump_s3_key = f"{s3_prefix}/whole_data_dump-{INGEST_CYCLE_START}.zip"
 
     s3_client = get_s3_client()
     valid_credentials = s3_client.is_connected()
@@ -209,24 +210,26 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
     # upload a buffer, it starts from the position it is currently in. We need to
     # add the seek(0) to reset the buffer position to the beginning before writing
     # to S3 to avoid creating an empty file.
-    df_as_csv = generate_data_dump_as_csv(INGEST_CYCLE_START, db)
-    df_as_csv.seek(0)
+    zip_buffer = create_data_download_zip_archive(INGEST_CYCLE_START, db)
+    zip_buffer.seek(0)
 
     try:
         response = s3_client.upload_fileobj(
            bucket=DOC_CACHE_BUCKET,
            key=data_dump_s3_key,
-           content_type="application/csv",
-           fileobj=df_as_csv,
+           content_type="application/zip",
+           fileobj=zip_buffer,
         )
         if response is False:
-            _LOGGER.error("Failed to upload object to s3: %s", response)
+            _LOGGER.error("Failed to upload archive to s3: %s", response)
+        else:
+            _LOGGER.info(f"Finished uploading data archive to {DOC_CACHE_BUCKET}")
+
     except Exception as e:
         _LOGGER.error(e)
 
     s3_document = S3Document(DOC_CACHE_BUCKET, AWS_REGION, data_dump_s3_key)
     if s3_client.document_exists(s3_document):
-        _LOGGER.info(f"Finished uploading data dump to {DOC_CACHE_BUCKET}")
         _LOGGER.info("Redirecting to CDN data dump location...")
         redirect_url = f"https://{CDN_DOMAIN}/{data_dump_s3_key}"
         return RedirectResponse(redirect_url, status_code=status.HTTP_303_SEE_OTHER)
diff --git a/app/core/download.py b/app/core/download.py
index c4a27b84..3dc298af 100644
--- a/app/core/download.py
+++ b/app/core/download.py
@@ -1,6 +1,7 @@
 """Functions to support browsing the RDS document structure"""
 
-from io import BytesIO
+import zipfile
+from io import BytesIO, StringIO
 from logging import getLogger
 
 import pandas as pd
@@ -274,4 +275,38 @@ def convert_dump_to_csv(df: pd.DataFrame):
 def generate_data_dump_as_csv(ingest_cycle_start: str, db=Depends(get_db)):
     df = get_whole_database_dump(ingest_cycle_start, db)
     csv = convert_dump_to_csv(df)
+    csv.seek(0)
     return csv
+
+
+def generate_data_dump_readme(ingest_cycle_start: str):
+    file_buffer = StringIO(
+        "Thank you for downloading the full document dataset from Climate Policy Radar "
+        "and Climate Change Laws of the World!"
+ "\n\n" + "For more information including our data dictionary, methodology and " + "information about how to cite us, visit " + "\n" + "https://climatepolicyradar.notion.site/Readme-for-document-data-download-f2d55b7e238941b59559b9b1c4cc52c5" + ".\n\n" + "View our terms of use at https://app.climatepolicyradar.org/terms-of-use" + "\n\n" + f"Date data last updated: {ingest_cycle_start}" + ) + file_buffer.seek(0) + return file_buffer + + +def create_data_download_zip_archive(ingest_cycle_start: str, db=Depends(get_db)): + readme_buffer = generate_data_dump_readme(ingest_cycle_start) + csv_buffer = generate_data_dump_as_csv(ingest_cycle_start, db) + + zip_buffer = BytesIO() + with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: + for file_name, data in [ + ("README.txt", readme_buffer), + (f"Document_Data_Download-{ingest_cycle_start}.csv", csv_buffer), + ]: + zip_file.writestr(file_name, data.getvalue()) + + return zip_buffer