Skip to content

Commit

Permalink
Try loading the bucket's attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
krysal committed May 9, 2024
1 parent 7c6fefe commit 98024a7
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
12 changes: 9 additions & 3 deletions ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,14 @@ class CleanDataUploader:
}

def __init__(self):
self.date = time.strftime("%Y-%m-%d")
self.buffer_size = config("CLEANUP_BUFFER_SIZE", default=10_000_000, cast=int)
bucket_name = config("OPENVERSE_BUCKET", default="openverse-catalog")
try:
self.s3 = self._get_s3_resource()
self.s3_bucket = self.s3.Bucket(bucket_name)
self.date = time.strftime("%Y-%m-%d")
# Try loading the bucket's attributes to check the connection works.
self.s3_bucket.load()
except Exception as e:
log.error(f"Error connecting to S3 or creating bucket: {e}")
self.s3 = None
Expand All @@ -319,6 +321,10 @@ def _get_s3_resource():
)

def _upload_to_s3(self, field: str):
if not self.s3_bucket:
log.warning("No S3 bucket available, skipping upload.")
return

part_number = self.buffer[field].part
log.info(f"Uploading file part {part_number} of `{field}` to S3...")
s3_file_name = f"{self.s3_path}/{self.date}_{field}_{part_number}.tsv"
Expand All @@ -336,7 +342,7 @@ def _upload_to_s3(self, field: str):

def save(self, result: dict) -> dict[str, int]:
for field, cleaned_items in result.items():
if not cleaned_items or not self.s3_bucket:
if not cleaned_items:
continue

self.buffer[field].rows += cleaned_items
Expand All @@ -348,7 +354,7 @@ def save(self, result: dict) -> dict[str, int]:
def flush(self):
log.info("Clearing buffer.")
for field in self.buffer:
if self.buffer[field].rows and self.s3_bucket is not None:
if self.buffer[field].rows:
self._upload_to_s3(field)


Expand Down
2 changes: 1 addition & 1 deletion ingestion_server/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ create-and-populate-filtered-index model="image" destination_suffix="init":
#########

# Run ingestion-server tests locally
test-local *args="--exitfirst":
test-local *args="--verbose --exitfirst":
# populate the tldextract cache before running tests to prevent unnecessary network requests during tests
# and to avoid needing to mock essentially unmockable responses
pipenv run tldextract --update
Expand Down

0 comments on commit 98024a7

Please sign in to comment.