Skip to content

Commit

Permalink
PDCT-796 Create archive containing README and data dump CSV. (#215)
Browse files Browse the repository at this point in the history
  • Loading branch information
katybaulch authored Jan 25, 2024
1 parent b6389c2 commit 1134d50
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 9 deletions.
19 changes: 11 additions & 8 deletions app/api/api_v1/routers/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
VESPA_SECRETS_LOCATION,
VESPA_URL,
)
from app.core.download import generate_data_dump_as_csv
from app.core.download import create_data_download_zip_archive
from app.core.lookups import get_countries_for_region, get_country_by_slug
from app.core.search import (
ENCODER,
Expand Down Expand Up @@ -188,7 +188,8 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
detail="Missing required environment variables",
)

data_dump_s3_key = "navigator/whole_data_dump.csv"
s3_prefix = "navigator/dumps"
data_dump_s3_key = f"{s3_prefix}/whole_data_dump-{INGEST_CYCLE_START}.zip"

s3_client = get_s3_client()
valid_credentials = s3_client.is_connected()
Expand All @@ -209,24 +210,26 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
# upload a buffer, it starts from the position it is currently in. We need to
# add the seek(0) to reset the buffer position to the beginning before writing
# to S3 to avoid creating an empty file.
df_as_csv = generate_data_dump_as_csv(INGEST_CYCLE_START, db)
df_as_csv.seek(0)
zip_buffer = create_data_download_zip_archive(INGEST_CYCLE_START, db)
zip_buffer.seek(0)

try:
response = s3_client.upload_fileobj(
bucket=DOC_CACHE_BUCKET,
key=data_dump_s3_key,
content_type="application/csv",
fileobj=df_as_csv,
content_type="application/zip",
fileobj=zip_buffer,
)
if response is False:
_LOGGER.error("Failed to upload object to s3: %s", response)
_LOGGER.error("Failed to upload archive to s3: %s", response)
else:
_LOGGER.info(f"Finished uploading data archive to {DOC_CACHE_BUCKET}")

except Exception as e:
_LOGGER.error(e)

s3_document = S3Document(DOC_CACHE_BUCKET, AWS_REGION, data_dump_s3_key)
if s3_client.document_exists(s3_document):
_LOGGER.info(f"Finished uploading data dump to {DOC_CACHE_BUCKET}")
_LOGGER.info("Redirecting to CDN data dump location...")
redirect_url = f"https://{CDN_DOMAIN}/{data_dump_s3_key}"
return RedirectResponse(redirect_url, status_code=status.HTTP_303_SEE_OTHER)
Expand Down
37 changes: 36 additions & 1 deletion app/core/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Functions to support browsing the RDS document structure"""

from io import BytesIO
import zipfile
from io import BytesIO, StringIO
from logging import getLogger

import pandas as pd
Expand Down Expand Up @@ -274,4 +275,38 @@ def convert_dump_to_csv(df: pd.DataFrame):
def generate_data_dump_as_csv(ingest_cycle_start: str, db=Depends(get_db)):
    """Produce the full-database dump as an in-memory CSV buffer.

    Fetches the whole database dump for the given ingest cycle, converts it
    to CSV, and rewinds the buffer so callers can read it from the start.

    :param str ingest_cycle_start: Start date of the ingest cycle the dump
        corresponds to.
    :param db: Database session (resolved via FastAPI dependency injection
        by default).
    :return: A buffer positioned at offset 0 containing the CSV content.
    """
    dump_df = get_whole_database_dump(ingest_cycle_start, db)
    csv_buffer = convert_dump_to_csv(dump_df)
    csv_buffer.seek(0)
    return csv_buffer


def generate_data_dump_readme(ingest_cycle_start: str):
    """Build the README that accompanies the data-download archive.

    The README thanks the user, links to further documentation and the terms
    of use, and records when the data was last updated.

    :param str ingest_cycle_start: Start date of the ingest cycle, embedded
        in the "last updated" line.
    :return: A StringIO buffer positioned at offset 0 containing the README
        text.
    """
    notion_url = (
        "https://climatepolicyradar.notion.site/"
        "Readme-for-document-data-download-f2d55b7e238941b59559b9b1c4cc52c5"
    )
    paragraphs = [
        "Thank you for downloading the full document dataset from Climate "
        "Policy Radar and Climate Change Laws of the World!",
        "For more information including our data dictionary, methodology and "
        "information about how to cite us, visit \n" + notion_url + ".",
        "View our terms of use at https://app.climatepolicyradar.org/terms-of-use",
        f"Date data last updated: {ingest_cycle_start}",
    ]
    readme_buffer = StringIO("\n\n".join(paragraphs))
    readme_buffer.seek(0)
    return readme_buffer


def create_data_download_zip_archive(ingest_cycle_start: str, db=Depends(get_db)):
    """Bundle the README and the full CSV data dump into one zip archive.

    The archive contains ``README.txt`` plus a dated CSV of the whole
    database dump, both compressed with DEFLATE. The returned buffer is NOT
    rewound — callers are expected to ``seek(0)`` before reading/uploading.

    :param str ingest_cycle_start: Start date of the ingest cycle, used in
        the README text and the CSV file name.
    :param db: Database session (resolved via FastAPI dependency injection
        by default).
    :return: A BytesIO buffer containing the zip archive.
    """
    archive_buffer = BytesIO()
    with zipfile.ZipFile(archive_buffer, "a", zipfile.ZIP_DEFLATED, False) as archive:
        archive.writestr(
            "README.txt",
            generate_data_dump_readme(ingest_cycle_start).getvalue(),
        )
        archive.writestr(
            f"Document_Data_Download-{ingest_cycle_start}.csv",
            generate_data_dump_as_csv(ingest_cycle_start, db).getvalue(),
        )
    return archive_buffer

0 comments on commit 1134d50

Please sign in to comment.