Skip to content

Commit

Permalink
Remove single quotes in values of Ingestion Server's TSV files (#4471)
Browse files Browse the repository at this point in the history
  • Loading branch information
krysal authored Jun 26, 2024
1 parent d5d4070 commit 3cfdb8f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
42 changes: 28 additions & 14 deletions ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import csv
import logging as log
import multiprocessing
import pathlib
import shutil
import time
import uuid
from urllib.parse import urlparse
Expand Down Expand Up @@ -62,8 +64,12 @@
"www.eol.org": True,
".digitaltmuseum.org": True,
"collections.musee-mccord.qc.ca": False,
".stocksnap.io": True,
"cdn.stocksnap.io": True,
}

TMP_DIR = pathlib.Path("/tmp/cleaned_data").resolve()


def _tag_denylisted(tag):
"""Check if a tag is banned or contains a banned substring."""
Expand Down Expand Up @@ -106,9 +112,9 @@ def cleanup_url(url, tls_support):
log.debug(f"Tested domain {_tld}")

if tls_supported:
return f"'https://{url}'"
return f"https://{url}"
else:
return f"'http://{url}'"
return f"http://{url}"
else:
return None

Expand Down Expand Up @@ -141,6 +147,7 @@ def cleanup_tags(tags):

if update_required:
fragment = Json(tag_output)
log.debug(f"Tags fragment: {fragment}")
return fragment
else:
return None
Expand Down Expand Up @@ -200,7 +207,7 @@ def test_tls_supported(cls, url):
https = url.replace("http://", "https://")
try:
res = re.get(https, timeout=2)
log.info(f"{https}:{res.status_code}")
log.info(f"tls_test - {https}:{res.status_code}")
return 200 <= res.status_code < 400
except re.RequestException:
return False
Expand Down Expand Up @@ -243,23 +250,27 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):
if clean:
cleaned_data[update_field] = clean
log.debug(
f"Updated {update_field} for {identifier} "
f"from '{dirty_value}' to '{clean}'"
f"Updated {update_field} for {identifier}\n\t"
f"from '{dirty_value}' \n\tto '{clean}'"
)
# Generate SQL update for all the fields we just cleaned
update_field_expressions = []
for field, clean_value in cleaned_data.items():
update_field_expressions.append(f"{field} = {clean_value}")
# Save cleaned values for later
# (except for tags, which take up too much space)
if field == "tags":
# The `clean_value` for tags already includes the single quotes,
# so it's not necessary to add them, and they're omitted in
# `cleaned_values` to save in files later because they take up
# too much disk space.
update_field_expressions.append(f"{field} = {clean_value}")
continue
update_field_expressions.append(f"{field} = '{clean_value}'")
cleaned_values[field].append((identifier, clean_value))

if len(update_field_expressions) > 0:
update_query = f"""UPDATE {temp_table} SET
{', '.join(update_field_expressions)} WHERE id = {_id}
"""
log.debug(f"Executing update query: \n\t{update_query}")
write_cur.execute(update_query)
log.info(f"TLS cache: {TLS_CACHE}")
log.info("Worker committing changes...")
Expand All @@ -273,18 +284,17 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):


def save_cleaned_data(result: dict) -> dict[str, int]:
log.info("Saving cleaned data...")
start_time = time.perf_counter()

cleanup_counts = {field: len(items) for field, items in result.items()}
for field, cleaned_items in result.items():
# Skip the tag field because the file is too large and fills up the disk
if field == "tag":
if field == "tag" or not cleaned_items:
continue
if cleaned_items:
with open(f"{field}.tsv", "a") as f:
csv_writer = csv.writer(f, delimiter="\t")
csv_writer.writerows(cleaned_items)

with open(TMP_DIR.joinpath(f"{field}.tsv"), "a", encoding="utf-8") as f:
csv_writer = csv.writer(f, delimiter="\t")
csv_writer.writerows(cleaned_items)

end_time = time.perf_counter()
total_time = end_time - start_time
Expand All @@ -300,6 +310,10 @@ def clean_image_data(table):
:return: None
"""

# Recreate directory where cleaned data is stored
shutil.rmtree(TMP_DIR, ignore_errors=True)
TMP_DIR.mkdir(parents=True)

# Map each table to the fields that need to be cleaned up. Then, map each
# field to its cleanup function.
log.info("Cleaning up data...")
Expand Down
4 changes: 2 additions & 2 deletions ingestion_server/test/unit_tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ def test_url_protocol_fix():
tls_support_cache = {}
pook.get("https://flickr.com").reply(200)
result = CleanupFunctions.cleanup_url(bad_url, tls_support_cache)
expected = "'https://flickr.com'"
expected = "https://flickr.com"

bad_http = "neverssl.com"
pook.get("https://neverssl.com").reply(500)
result_http = CleanupFunctions.cleanup_url(bad_http, tls_support_cache)
expected_http = "'http://neverssl.com'"
expected_http = "http://neverssl.com"
assert result == expected
assert result_http == expected_http

Expand Down

0 comments on commit 3cfdb8f

Please sign in to comment.