From 6717e2a961f8820d48c7f9864deb6dcfd8a1a03a Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Thu, 9 Jun 2022 06:50:31 +0200 Subject: [PATCH] Add the changes from #6643 --- openlibrary/data/dump.py | 5 ++--- scripts/oldump.sh | 9 +++++++-- scripts/sitemaps/sitemap.py | 27 ++++++++++++++++----------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/openlibrary/data/dump.py b/openlibrary/data/dump.py index 6a35d5bc7c1..649ea750c12 100644 --- a/openlibrary/data/dump.py +++ b/openlibrary/data/dump.py @@ -30,9 +30,8 @@ def log(*args) -> None: - msg = f"{datetime.now():%Y-%m-%d %H:%M:%S} [openlibrary.dump] " + " ".join( - str(a) for a in args - ) + args_str = " ".join(str(a) for a in args) + msg = f"{datetime.now():%Y-%m-%d %H:%M:%S} [openlibrary.dump] {args_str}" logger.info(msg) print(msg, file=sys.stderr) diff --git a/scripts/oldump.sh b/scripts/oldump.sh index 7645e4217de..0ab6a87b301 100755 --- a/scripts/oldump.sh +++ b/scripts/oldump.sh @@ -48,7 +48,7 @@ yyyymm=${yyyymmdd:0:7} # 2022-05-31 --> 2022-05 cdump=ol_cdump_$yyyymmdd dump=ol_dump_$yyyymmdd -if [ $# -lt 1 ] +if [[ $# -lt 1 ]] then echo "USAGE: $0 yyyy-mm-dd [--archive] [--overwrite]" 1>&2 exit 1 @@ -57,6 +57,7 @@ fi function cleanup() { rm -f $TMPDIR/dumps/data.txt.gz rm -rf $TMPDIR/dumps/ol_* + rm -rf $TMPDIR/sitemaps } function log() { @@ -185,10 +186,14 @@ ls -lhR # Archival # ======== # Only archive if that caller has requested it and we are not testing. -if [ $@ == *'--archive'* ]; then +if [[ $@ == *'--archive'* ]]; then if [[ -z $OLDUMP_TESTING ]]; then archive_dumps + else + log "Skipping archival: Test mode" fi +else + log "Skipping archival: Option omitted" fi # ================= diff --git a/scripts/sitemaps/sitemap.py b/scripts/sitemaps/sitemap.py index 752151b776d..a93dc572058 100755 --- a/scripts/sitemaps/sitemap.py +++ b/scripts/sitemaps/sitemap.py @@ -6,13 +6,13 @@ python sitemaps.py suffix dump.txt.gz """ -import datetime import gzip import itertools import json +import logging import os import re -import time +from datetime import datetime import web @@ -38,10 +38,20 @@ """ +logger = logging.getLogger(__file__) +logger.setLevel(logging.DEBUG) + sitemap = web.template.Template(t_sitemap) siteindex = web.template.Template(t_siteindex) +def log(*args) -> None: + args_str = " ".join(str(a) for a in args) + msg = f"{datetime.now():%Y-%m-%d %H:%M:%S} [openlibrary.dump] {args_str}" + logger.info(msg) + print(msg, file=sys.stderr) + + def xopen(filename): return gzip.open(filename) if filename.endswith(".gz") else open(filename) @@ -60,7 +70,7 @@ def urlsafe(name): space = ' \n\r' unsafe = reserved + delims + unwise + space - pattern = '[%s]+' % "".join(re.escape(c) for c in unsafe) + pattern = f"[{''.join(re.escape(c) for c in unsafe)}]+" safepath_re = re.compile(pattern) return safepath_re.sub('_', name).replace(' ', '-').strip('_')[:100] @@ -116,14 +126,14 @@ def generate_sitemaps(filename): things.append(web.storage(path=path, last_modified=last_modified)) if things: - write("sitemaps/sitemap_%s.xml.gz" % sortkey, sitemap(things)) + write(f"sitemaps/sitemap_{sortkey}.xml.gz", sitemap(things)) def generate_siteindex(): filenames = sorted(os.listdir("sitemaps")) if "siteindex.xml.gz" in filenames: filenames.remove("siteindex.xml.gz") - timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z' + timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z' index = siteindex(filenames, timestamp) write("sitemaps/siteindex.xml.gz", index) @@ -135,7 +145,7 @@ def write(path, text): with gzip.open(path, 'w') as f: f.write(text.encode()) except Exception as e: - print(f'write fail {e}') + log(f'write fail {e}') # os.system("gzip " + path) @@ -163,11 +173,6 @@ def system(cmd): raise Exception("%r failed with exit status: %d" % (cmd, status)) -def log(*args): - msg = " ".join(map(str, args)) - print(f"{time.asctime()} {msg}") - - def main(dumpfile): system("rm -rf sitemaps sitemaps_data.txt*; mkdir sitemaps")