Skip to content

Commit

Permalink
PDCT-796 Create archive containing README and data dump CSV. (#215)
Browse files Browse the repository at this point in the history
  • Loading branch information
katybaulch authored Jan 25, 2024
1 parent b6389c2 commit 1134d50
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 9 deletions.
19 changes: 11 additions & 8 deletions app/api/api_v1/routers/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
VESPA_SECRETS_LOCATION,
VESPA_URL,
)
from app.core.download import generate_data_dump_as_csv
from app.core.download import create_data_download_zip_archive
from app.core.lookups import get_countries_for_region, get_country_by_slug
from app.core.search import (
ENCODER,
Expand Down Expand Up @@ -188,7 +188,8 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
detail="Missing required environment variables",
)

data_dump_s3_key = "navigator/whole_data_dump.csv"
s3_prefix = "navigator/dumps"
data_dump_s3_key = f"{s3_prefix}/whole_data_dump-{INGEST_CYCLE_START}.zip"

s3_client = get_s3_client()
valid_credentials = s3_client.is_connected()
Expand All @@ -209,24 +210,26 @@ def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
# upload a buffer, it starts from the position it is currently in. We need to
# add the seek(0) to reset the buffer position to the beginning before writing
# to S3 to avoid creating an empty file.
df_as_csv = generate_data_dump_as_csv(INGEST_CYCLE_START, db)
df_as_csv.seek(0)
zip_buffer = create_data_download_zip_archive(INGEST_CYCLE_START, db)
zip_buffer.seek(0)

try:
response = s3_client.upload_fileobj(
bucket=DOC_CACHE_BUCKET,
key=data_dump_s3_key,
content_type="application/csv",
fileobj=df_as_csv,
content_type="application/zip",
fileobj=zip_buffer,
)
if response is False:
_LOGGER.error("Failed to upload object to s3: %s", response)
_LOGGER.error("Failed to upload archive to s3: %s", response)
else:
_LOGGER.info(f"Finished uploading data archive to {DOC_CACHE_BUCKET}")

except Exception as e:
_LOGGER.error(e)

s3_document = S3Document(DOC_CACHE_BUCKET, AWS_REGION, data_dump_s3_key)
if s3_client.document_exists(s3_document):
_LOGGER.info(f"Finished uploading data dump to {DOC_CACHE_BUCKET}")
_LOGGER.info("Redirecting to CDN data dump location...")
redirect_url = f"https://{CDN_DOMAIN}/{data_dump_s3_key}"
return RedirectResponse(redirect_url, status_code=status.HTTP_303_SEE_OTHER)
Expand Down
37 changes: 36 additions & 1 deletion app/core/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Functions to support browsing the RDS document structure"""

from io import BytesIO
import zipfile
from io import BytesIO, StringIO
from logging import getLogger

import pandas as pd
Expand Down Expand Up @@ -274,4 +275,38 @@ def convert_dump_to_csv(df: pd.DataFrame):
def generate_data_dump_as_csv(ingest_cycle_start: str, db=Depends(get_db)):
    """Produce the full-database dump as an in-memory CSV buffer.

    Fetches the whole database dump for the given ingest cycle, converts it
    to CSV, and rewinds the buffer so callers can read it from the start.

    :param str ingest_cycle_start: Start date of the ingest cycle the dump
        corresponds to.
    :param db: Database session (resolved via FastAPI dependency injection
        by default).
    :return: A buffer positioned at offset 0 containing the CSV content.
    """
    dump_df = get_whole_database_dump(ingest_cycle_start, db)
    csv_buffer = convert_dump_to_csv(dump_df)
    csv_buffer.seek(0)
    return csv_buffer


def generate_data_dump_readme(ingest_cycle_start: str):
    """Build the README that accompanies the data-download archive.

    The README thanks the user, links to further documentation and the terms
    of use, and records when the data was last updated.

    :param str ingest_cycle_start: Start date of the ingest cycle, embedded
        in the "last updated" line.
    :return: A StringIO buffer positioned at offset 0 containing the README
        text.
    """
    notion_url = (
        "https://climatepolicyradar.notion.site/"
        "Readme-for-document-data-download-f2d55b7e238941b59559b9b1c4cc52c5"
    )
    paragraphs = [
        "Thank you for downloading the full document dataset from Climate "
        "Policy Radar and Climate Change Laws of the World!",
        "For more information including our data dictionary, methodology and "
        "information about how to cite us, visit \n" + notion_url + ".",
        "View our terms of use at https://app.climatepolicyradar.org/terms-of-use",
        f"Date data last updated: {ingest_cycle_start}",
    ]
    readme_buffer = StringIO("\n\n".join(paragraphs))
    readme_buffer.seek(0)
    return readme_buffer


def create_data_download_zip_archive(ingest_cycle_start: str, db=Depends(get_db)):
    """Bundle the README and the full CSV data dump into one zip archive.

    The archive contains ``README.txt`` plus a dated CSV of the whole
    database dump, both compressed with DEFLATE. The returned buffer is NOT
    rewound — callers are expected to ``seek(0)`` before reading/uploading.

    :param str ingest_cycle_start: Start date of the ingest cycle, used in
        the README text and the CSV file name.
    :param db: Database session (resolved via FastAPI dependency injection
        by default).
    :return: A BytesIO buffer containing the zip archive.
    """
    archive_buffer = BytesIO()
    with zipfile.ZipFile(archive_buffer, "a", zipfile.ZIP_DEFLATED, False) as archive:
        archive.writestr(
            "README.txt",
            generate_data_dump_readme(ingest_cycle_start).getvalue(),
        )
        archive.writestr(
            f"Document_Data_Download-{ingest_cycle_start}.csv",
            generate_data_dump_as_csv(ingest_cycle_start, db).getvalue(),
        )
    return archive_buffer

0 comments on commit 1134d50

Please sign in to comment.