Skip to content

Commit

Permalink
Include no quotes in files of cleaned data
Browse files Browse the repository at this point in the history
Co-authored-by: Madison Swain-Bowden <[email protected]>
  • Loading branch information
krysal and AetherUnbound committed May 8, 2024
1 parent ed6baed commit 9e1fff0
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _upload_to_s3(self, field: str):
s3_file_name = f"{self.s3_path}/{self.date}_{field}_{part_number}.tsv"
tsv_file = f"{field}.tsv"
with open(tsv_file, "w") as f:
csv_writer = csv.writer(f, delimiter="\t")
csv_writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_NONE)
csv_writer.writerows(self.buffer[field].rows)
try:
self.s3_bucket.upload_file(tsv_file, s3_file_name)
Expand All @@ -334,12 +334,23 @@ def _upload_to_s3(self, field: str):
self.buffer[field].part += 1
self.buffer[field].rows = []

@staticmethod
def _trim_quotes(value: str):
    """Drop one leading and one trailing quote character from ``value``.

    The value is trimmed only when it both starts and ends with a single
    (``'``) or double (``"``) quote; otherwise it is returned unchanged.

    The two ends are checked independently, so the opening and closing
    quotes do not have to be the same character. A value consisting of a
    single quote character satisfies both checks and comes back as an
    empty string.

    :param value: the string to inspect.
    :return: ``value`` without its first and last character when both are
        quote characters, else ``value`` as given.
    """
    quote_chars = ("'", '"')

    # Guard clause: leave anything that is not wrapped in quotes untouched.
    if not value.startswith(quote_chars) or not value.endswith(quote_chars):
        return value

    log.debug(f"Trimmed quotes from {value} returning {value[1:-1]}")
    return value[1:-1]

def save(self, result: dict) -> dict[str, int]:
for field, cleaned_items in result.items():
if not cleaned_items or not self.s3_bucket:
continue

for i, (identifier, value) in enumerate(cleaned_items):
cleaned_items[i] = (identifier, self._trim_quotes(value))

self.buffer[field].rows += cleaned_items

if len(self.buffer[field].rows) >= self.buffer_size:
self._upload_to_s3(field)

Expand Down

0 comments on commit 9e1fff0

Please sign in to comment.