PDCT-788: Redirect to CDN instead of streaming response (#214)
* PDCT-788 Redirect to CDN instead of returning streaming response.

* PDCT-788 Explicitly specified UTF-8 encoding.

* PDCT-788 Added pyright config to pyproject.toml.

* PDCT-788 Fixed empty file upload to S3 with seek(0).

* PDCT-788 Removed vague CD directory command for generalisation.

* PDCT-788 Moved create logic into download route.

* PDCT-788 Updated bool conversion logic for DEVELOPMENT_MODE variable.

* PDCT-788 Updated str comparison to bool.

* PDCT-788 Removed debug prints & added time range to query.
katybaulch authored Jan 24, 2024
1 parent aa38cac commit b6389c2
Showing 8 changed files with 298 additions and 281 deletions.
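
The heart of the change is the search.py diff below: instead of streaming the CSV back through the API process, the route now issues an HTTP redirect to the same object served via the CDN. A minimal sketch of the pattern, with a placeholder CDN_DOMAIN value and the S3 existence check omitted for brevity (the real route below also handles credential failures and generates the dump on demand):

# Minimal sketch of the redirect-to-CDN pattern this commit adopts.
# CDN_DOMAIN here is a placeholder; the real app reads it from config.
from fastapi import FastAPI, status
from starlette.responses import RedirectResponse

app = FastAPI()

CDN_DOMAIN = "cdn.example.org"
DATA_DUMP_S3_KEY = "navigator/whole_data_dump.csv"

@app.get("/searches/download-all-data")
def download_all_search_documents() -> RedirectResponse:
    # 303 See Other tells the client to fetch the Location with a GET,
    # the conventional status for "this resource lives over there".
    redirect_url = f"https://{CDN_DOMAIN}/{DATA_DUMP_S3_KEY}"
    return RedirectResponse(redirect_url, status_code=status.HTTP_303_SEE_OTHER)

Offloading the download to the CDN keeps large CSV responses off the API workers and lets edge caching absorb repeated requests.
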
2 changes: 1 addition & 1 deletion Makefile
@@ -4,4 +4,4 @@ include ./makefile-docker.defs

git_hooks:
# Install git pre-commit hooks
cd backend/; poetry run pre-commit install --install-hooks
poetry run pre-commit install --install-hooks
70 changes: 38 additions & 32 deletions app/api/api_v1/routers/search.py
@@ -7,7 +7,6 @@
"""
import json
import logging
from datetime import datetime
from io import BytesIO
from typing import Mapping, Sequence

@@ -16,21 +15,21 @@
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from sqlalchemy.orm import Session
from starlette.responses import RedirectResponse

from app.api.api_v1.schemas.search import SearchRequestBody, SearchResponse, SortField
from app.core.aws import S3Document, get_s3_client
from app.core.browse import BrowseArgs, browse_rds_families
from app.core.config import (
AWS_REGION,
CDN_DOMAIN,
DOC_CACHE_BUCKET,
INGEST_CYCLE_START,
PUBLIC_APP_URL,
VESPA_SECRETS_LOCATION,
VESPA_URL,
)
from app.core.download import (
generate_data_dump_as_csv,
)
from app.core.download import generate_data_dump_as_csv
from app.core.lookups import get_countries_for_region, get_country_by_slug
from app.core.search import (
ENCODER,
@@ -173,7 +172,7 @@ def download_search_documents(


@search_router.get("/searches/download-all-data")
def download_all_search_documents(db=Depends(get_db)) -> StreamingResponse:
def download_all_search_documents(db=Depends(get_db)) -> RedirectResponse:
"""Download a CSV containing details of all the documents in the corpus."""
_LOGGER.info("Whole data download request")

@@ -189,44 +188,51 @@ def download_all_search_documents(db=Depends(get_db)) -> StreamingResponse:
detail="Missing required environment variables",
)

aws_environment = "production" if "dev" not in PUBLIC_APP_URL else "staging"
data_dump_s3_key = f"navigator/{aws_environment}_data_dump_{INGEST_CYCLE_START}.csv"
data_dump_s3_key = "navigator/whole_data_dump.csv"

s3_client = get_s3_client()
valid_credentials = s3_client.is_connected()
if not valid_credentials:
_LOGGER.info("Error connecting to S3 AWS")
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Error connecting to AWS"
)

s3_document = S3Document(DOC_CACHE_BUCKET, AWS_REGION, data_dump_s3_key)
if not s3_client.document_exists(s3_document):
_LOGGER.info(f"Generating dump for ingest cycle w/c {INGEST_CYCLE_START}...")
df_as_csv = generate_data_dump_as_csv(db)
if valid_credentials is True and (not s3_client.document_exists(s3_document)):
aws_env = "production" if "dev" not in PUBLIC_APP_URL else "staging"
_LOGGER.info(
f"Generating {aws_env} dump for ingest cycle w/c {INGEST_CYCLE_START}..."
)

if valid_credentials is False:
_LOGGER.error("Cannot connect to AWS.")
else:
# After writing to a file buffer the position stays at the end whereas when you
# upload a buffer, it starts from the position it is currently in. We need to
# add the seek(0) to reset the buffer position to the beginning before writing
# to S3 to avoid creating an empty file.
df_as_csv = generate_data_dump_as_csv(INGEST_CYCLE_START, db)
df_as_csv.seek(0)

try:
response = s3_client.upload_fileobj(
df_as_csv, DOC_CACHE_BUCKET, data_dump_s3_key
bucket=DOC_CACHE_BUCKET,
key=data_dump_s3_key,
content_type="application/csv",
fileobj=df_as_csv,
)
if response is False:
_LOGGER.error("Failed to upload object to s3: %s", response)
except Exception as e:
_LOGGER.error(e)

if s3_client.document_exists(s3_document):
_LOGGER.debug("Finished uploading data dump to s3")

else:
_LOGGER.debug("File already exists in S3. Fetching...")

s3_file = s3_client.download_file(s3_document)

_LOGGER.debug(f"Downloading all documents as of '{INGEST_CYCLE_START}' as CSV")
timestamp = datetime.now()
filename = f"whole_database_dump-{timestamp}.csv"
return StreamingResponse(
content=BytesIO(s3_file.read()),
headers={
"Content-Type": "text/csv",
"Content-Disposition": f"attachment; filename={filename}",
},
)
s3_document = S3Document(DOC_CACHE_BUCKET, AWS_REGION, data_dump_s3_key)
if s3_client.document_exists(s3_document):
_LOGGER.info(f"Finished uploading data dump to {DOC_CACHE_BUCKET}")
_LOGGER.info("Redirecting to CDN data dump location...")
redirect_url = f"https://{CDN_DOMAIN}/{data_dump_s3_key}"
return RedirectResponse(redirect_url, status_code=status.HTTP_303_SEE_OTHER)

_LOGGER.info(f"Can't find data dump for {INGEST_CYCLE_START} in {DOC_CACHE_BUCKET}")
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND)


def _get_browse_args_from_search_request_body(
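
The seek(0) line added above fixes a classic in-memory-buffer pitfall: after the CSV is written into the buffer, the stream position sits at the end, so an upload that reads from the current position sends zero bytes. A self-contained illustration (plain reads stand in for the S3 upload):

# The pitfall behind the "empty file upload" fix: a freshly written
# buffer reads back nothing until its position is rewound.
from io import BytesIO

buf = BytesIO()
buf.write(b"document_id,title\n1,Example\n")

print(buf.read())  # b'' -- position is at the end, so an upload here is empty
buf.seek(0)        # rewind before handing the buffer to the upload call
print(buf.read())  # b'document_id,title\n1,Example\n'
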
13 changes: 6 additions & 7 deletions app/core/aws.py
@@ -9,7 +9,7 @@
from botocore.exceptions import ClientError, UnauthorizedSSOTokenError
from botocore.response import StreamingBody

from app.core.config import AWS_REGION
from app.core.config import AWS_REGION, DEVELOPMENT_MODE

logger = logging.getLogger(__name__)

@@ -46,25 +46,25 @@ class S3Client:
"""Helper class to connect to S3 and perform actions on buckets and documents."""

def __init__(self, dev_mode: bool): # noqa: D107
if dev_mode is True:
logger.info("***************** IN DEVELOPMENT MODE *****************")
if dev_mode is False:
logger.info("***************** IN DEPLOYMENT MODE *****************")
self.client = boto3.client(
"s3",
aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
config=botocore.client.Config(
signature_version="s3v4",
region_name=AWS_REGION,
connect_timeout=10,
),
)
else:
logger.info("***************** IN DEPLOYMENT MODE *****************")
logger.info("***************** IN DEVELOPMENT MODE *****************")
self.client = boto3.client(
"s3",
aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
config=botocore.client.Config(
signature_version="s3v4",
region_name=AWS_REGION,
@@ -304,5 +304,4 @@ def document_exists(self, s3_document: S3Document) -> bool:

def get_s3_client():
"""Get s3 client for API."""
dev_mode = t.cast(bool, os.getenv("DEVELOPMENT_MODE", "False"))
return S3Client(dev_mode)
return S3Client(DEVELOPMENT_MODE)
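
The removed dev_mode line above was the root cause of the DEVELOPMENT_MODE bug: typing.cast only annotates for the type checker and performs no runtime conversion, so dev_mode was the non-empty string "False" whenever the variable was unset, and a string is never the singleton True. A short demonstration:

# typing.cast performs no runtime conversion, so the old dev_mode
# check always fell through to the deployment branch.
import os
import typing as t

os.environ.pop("DEVELOPMENT_MODE", None)  # simulate the variable being unset

dev_mode = t.cast(bool, os.getenv("DEVELOPMENT_MODE", "False"))
print(type(dev_mode))    # <class 'str'> -- still a string at runtime
print(dev_mode is True)  # False: a str is never the singleton True
print(bool(dev_mode))    # True: any non-empty string is truthy
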
2 changes: 2 additions & 0 deletions app/core/config.py
@@ -76,4 +76,6 @@
# Whole database dump
INGEST_CYCLE_START = os.getenv("INGEST_CYCLE_START")
DOC_CACHE_BUCKET = os.getenv("DOCUMENT_CACHE_BUCKET")
DEVELOPMENT_MODE: bool = os.getenv("DEVELOPMENT_MODE", "False").lower() == "true"
AWS_REGION = os.getenv("AWS_REGION", "eu-west-1")
CDN_DOMAIN = os.getenv("CDN_DOMAIN")
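
Environment variables are always strings, so any non-empty default (including "False") is truthy under a plain bool() conversion; comparing the lower-cased value to "true", as the added line does, is the standard fix. A quick sketch of how that parsing behaves, using a hypothetical helper that mirrors the added line:

# How the new DEVELOPMENT_MODE parsing behaves for typical inputs.
# parse_development_mode is a hypothetical helper mirroring the added line.
import os

def parse_development_mode() -> bool:
    return os.getenv("DEVELOPMENT_MODE", "False").lower() == "true"

for value in ("true", "True", "FALSE", "1", ""):
    os.environ["DEVELOPMENT_MODE"] = value
    print(f"{value!r:>8} -> {parse_development_mode()}")
# Only 'true'/'True' parse to True; note that '1' is treated as False,
# which is worth knowing if the variable is ever set numerically.
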
