Remove single quotes in values of Ingestion Server's TSV files #4471

Merged · 4 commits · Jun 26, 2024
ingestion_server/ingestion_server/cleanup.py (42 changes: 28 additions & 14 deletions)
```diff
@@ -7,6 +7,8 @@
 import csv
 import logging as log
 import multiprocessing
+import pathlib
+import shutil
 import time
 import uuid
 from urllib.parse import urlparse
```
```diff
@@ -62,8 +64,12 @@
     "www.eol.org": True,
     ".digitaltmuseum.org": True,
     "collections.musee-mccord.qc.ca": False,
+    ".stocksnap.io": True,
+    "cdn.stocksnap.io": True,
 }
 
+TMP_DIR = pathlib.Path("/tmp/cleaned_data").resolve()
+
 
 def _tag_denylisted(tag):
     """Check if a tag is banned or contains a banned substring."""
```
```diff
@@ -106,9 +112,9 @@ def cleanup_url(url, tls_support):
         log.debug(f"Tested domain {_tld}")
 
         if tls_supported:
-            return f"'https://{url}'"
+            return f"https://{url}"
         else:
-            return f"'http://{url}'"
+            return f"http://{url}"
     else:
         return None
```
```diff
@@ -141,6 +147,7 @@ def cleanup_tags(tags):
 
     if update_required:
         fragment = Json(tag_output)
+        log.debug(f"Tags fragment: {fragment}")
         return fragment
     else:
         return None
```
```diff
@@ -200,7 +207,7 @@ def test_tls_supported(cls, url):
         https = url.replace("http://", "https://")
         try:
             res = re.get(https, timeout=2)
-            log.info(f"{https}:{res.status_code}")
+            log.info(f"tls_test - {https}:{res.status_code}")
             return 200 <= res.status_code < 400
         except re.RequestException:
             return False
```
```diff
@@ -243,23 +250,27 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):
             if clean:
                 cleaned_data[update_field] = clean
                 log.debug(
-                    f"Updated {update_field} for {identifier} "
-                    f"from '{dirty_value}' to '{clean}'"
+                    f"Updated {update_field} for {identifier}\n\t"
+                    f"from '{dirty_value}' \n\tto '{clean}'"
                 )
         # Generate SQL update for all the fields we just cleaned
         update_field_expressions = []
         for field, clean_value in cleaned_data.items():
-            update_field_expressions.append(f"{field} = {clean_value}")
-            # Save cleaned values for later
-            # (except for tags, which take up too much space)
             if field == "tags":
+                # The `clean_value` for tags already includes the single quotes,
+                # so it's not necessary to add them, and they're omitted in
+                # `cleaned_values` to save in files later because they take up
+                # too much disk space.
+                update_field_expressions.append(f"{field} = {clean_value}")
                 continue
+            update_field_expressions.append(f"{field} = '{clean_value}'")
```
Collaborator:

Why was this moved? I think this was easier to parse when the comment was before the `continue`, as well.

Member Author:

The URL fields had single quotes added twice. As you can see above in the file (lines 117 and 119), I removed the quotes from the cleaned value so they won't appear in the file either (single quotes as the quoting character are problematic for later loading in the DB).
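(For illustration only, not part of the original thread: a minimal sketch of what a `url.tsv` row would contain before and after this change, using the same `csv.writer` settings as `save_cleaned_data` below; the identifier is made up.)

```python
# Illustration: a url.tsv row before vs. after this PR.
# The identifier is invented; the writer settings mirror save_cleaned_data.
import csv
import io


def tsv_row(value: str) -> str:
    buf = io.StringIO()
    csv.writer(buf, delimiter="\t").writerow(("4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d", value))
    return buf.getvalue()


print(tsv_row("'https://flickr.com'"))  # before: the quotes are part of the data
print(tsv_row("https://flickr.com"))    # after: the bare URL
```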

Collaborator:

I'm also confused about this 😕

Where were the duplicate quotes coming from, exactly? And why does the `clean_value` for tags not need to be quoted but everything else does?

Maybe avoiding the `continue` would help the clarity of this block:

```python
for field, clean_value in cleaned_data.items():
    if field != "tags":
        # Save cleaned values for later
        # (except for tags, which take up too much space)
        cleaned_values[field].append((identifier, clean_value))

    update_field_expressions.append(f"{field} = {clean_value}")
```

Never mind that I don't understand where the difference is coming from for the format of the string added to `update_field_expressions`, this version is a lot easier to understand, to me.

Member Author:

> Where were the duplicate quotes coming from, exactly?

The cleaned values have quotes added in the return of the cleaning function:

```python
if tls_supported:
    return f"'https://{url}'"
else:
    return f"'http://{url}'"
```

> And why does the clean_value for tags not need to be quoted but everything else does?

The `psycopg2.extras.Json` function for tags already adapts a Python object to the SQL `json` data type, so it doesn't need extra quotes wrapping it.

The difference in the string added to `update_field_expressions` is the quotes. I assume this was previously done that way to avoid this confusion, but it's necessary to have the cleaned values without these quotes in the file so they don't interfere with the COPY upload to a DB table later.

An alternative would be to perform more string manipulation to remove the quotes before saving the rows in files, but this seemed unnecessary and more error-prone to me.

Collaborator:

Thanks for the explanation! That's fine with me. I don't think it's worth reworking this too much since these steps are going to be removed afterward anyway, but it would be good to expand that comment and move it before the `continue` to make this clearer, since multiple people found it confusing!

Collaborator:

I agree with Staci, but I also don't want to block on it because of this code being deleted soon anyway 🤷

Member Author:

I updated the comment. If it's still not clear, it's open to suggestions. I didn't think it would bring so much confusion 😅

```diff
             cleaned_values[field].append((identifier, clean_value))
 
         if len(update_field_expressions) > 0:
             update_query = f"""UPDATE {temp_table} SET
             {', '.join(update_field_expressions)} WHERE id = {_id}
             """
+            log.debug(f"Executing update query: \n\t{update_query}")
             write_cur.execute(update_query)
     log.info(f"TLS cache: {TLS_CACHE}")
     log.info("Worker committing changes...")
```
```diff
@@ -273,18 +284,17 @@
 
 
 def save_cleaned_data(result: dict) -> dict[str, int]:
     log.info("Saving cleaned data...")
     start_time = time.perf_counter()
 
     cleanup_counts = {field: len(items) for field, items in result.items()}
     for field, cleaned_items in result.items():
         # Skip the tag field because the file is too large and fills up the disk
-        if field == "tag":
+        if field == "tag" or not cleaned_items:
             continue
-        if cleaned_items:
-            with open(f"{field}.tsv", "a") as f:
-                csv_writer = csv.writer(f, delimiter="\t")
-                csv_writer.writerows(cleaned_items)
+        with open(TMP_DIR.joinpath(f"{field}.tsv"), "a", encoding="utf-8") as f:
+            csv_writer = csv.writer(f, delimiter="\t")
+            csv_writer.writerows(cleaned_items)
 
     end_time = time.perf_counter()
     total_time = end_time - start_time
```
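For context on why the stored values must now be bare: a later step (hypothetical here, not shown in this PR) could load one of these TSVs into Postgres with COPY, where a leftover single quote would become part of the stored value. A sketch, with an illustrative DSN and target table:

```python
# Hypothetical sketch of loading a cleaned TSV back into Postgres.
# The DSN and table name are illustrative, not taken from this PR.
import psycopg2

conn = psycopg2.connect("dbname=openledger")  # illustrative DSN
with conn, conn.cursor() as cur:
    with open("/tmp/cleaned_data/url.tsv", encoding="utf-8") as f:
        # COPY takes each tab-separated field verbatim; quotes left in the
        # file would be stored as part of the value.
        cur.copy_from(f, "image_cleaned_url", sep="\t", columns=("identifier", "url"))
```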
```diff
@@ -300,6 +310,10 @@ def clean_image_data(table):
     :return: None
     """
 
+    # Recreate directory where cleaned data is stored
+    shutil.rmtree(TMP_DIR, ignore_errors=True)
+    TMP_DIR.mkdir(parents=True)
+
     # Map each table to the fields that need to be cleaned up. Then, map each
     # field to its cleanup function.
     log.info("Cleaning up data...")
```
ingestion_server/test/unit_tests/test_cleanup.py (4 changes: 2 additions & 2 deletions)
```diff
@@ -47,12 +47,12 @@ def test_url_protocol_fix():
     tls_support_cache = {}
     pook.get("https://flickr.com").reply(200)
     result = CleanupFunctions.cleanup_url(bad_url, tls_support_cache)
-    expected = "'https://flickr.com'"
+    expected = "https://flickr.com"
 
     bad_http = "neverssl.com"
     pook.get("https://neverssl.com").reply(500)
     result_http = CleanupFunctions.cleanup_url(bad_http, tls_support_cache)
-    expected_http = "'http://neverssl.com'"
+    expected_http = "http://neverssl.com"
     assert result == expected
     assert result_http == expected_http
```