Skip to content

Commit

Permalink
Add the changes from internetarchive#6643
Browse files Browse the repository at this point in the history
  • Loading branch information
cclauss committed Jun 9, 2022
1 parent e4ebd45 commit 6717e2a
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 16 deletions.
5 changes: 2 additions & 3 deletions openlibrary/data/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@


def log(*args) -> None:
    """Emit a timestamped message to the module logger and to stderr.

    Every positional argument is converted with ``str()`` and the results are
    joined with single spaces. The joined text is prefixed with the current
    ``datetime.now()`` value formatted as ``YYYY-MM-DD HH:MM:SS`` and the
    ``[openlibrary.dump]`` tag.
    """
    # Build the message once; the earlier concatenation-based assignment to
    # ``msg`` was a dead store that this f-string immediately overwrote.
    args_str = " ".join(str(a) for a in args)
    msg = f"{datetime.now():%Y-%m-%d %H:%M:%S} [openlibrary.dump] {args_str}"
    logger.info(msg)
    # The same line is also written directly to stderr.
    print(msg, file=sys.stderr)

Expand Down
9 changes: 7 additions & 2 deletions scripts/oldump.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ yyyymm=${yyyymmdd:0:7} # 2022-05-31 --> 2022-05
cdump=ol_cdump_$yyyymmdd
dump=ol_dump_$yyyymmdd

if [ $# -lt 1 ]
if [[ $# -lt 1 ]]
then
echo "USAGE: $0 yyyy-mm-dd [--archive] [--overwrite]" 1>&2
exit 1
Expand All @@ -57,6 +57,7 @@ fi
function cleanup() {
    # Remove dumps/data.txt.gz, every dumps/ol_* entry and the sitemaps
    # directory from under $TMPDIR.
    # The expansions are quoted so that a $TMPDIR containing whitespace or glob
    # characters is not word-split; only the trailing ol_* is left unquoted so
    # that it still expands as a glob.
    rm -f "$TMPDIR/dumps/data.txt.gz"
    rm -rf "$TMPDIR"/dumps/ol_*
    rm -rf "$TMPDIR/sitemaps"
}

function log() {
Expand Down Expand Up @@ -185,10 +186,14 @@ ls -lhR
# Archival
# ========
# Only archive if the caller has requested it and we are not testing.
# $* (not $@) is used so that all arguments are explicitly joined into one
# string before being matched against the *--archive* pattern.
if [[ $* == *'--archive'* ]]; then
    # A non-empty OLDUMP_TESTING marks a test run: log and skip the upload.
    if [[ -z $OLDUMP_TESTING ]]; then
        archive_dumps
    else
        log "Skipping archival: Test mode"
    fi
else
    log "Skipping archival: Option omitted"
fi

# =================
Expand Down
27 changes: 16 additions & 11 deletions scripts/sitemaps/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
python sitemap.py suffix dump.txt.gz
"""

import datetime
import gzip
import itertools
import json
import logging
import os
import re
import time
from datetime import datetime

import web

Expand All @@ -38,10 +38,20 @@
</sitemapindex>
"""

# Compiled web.py templates built from the t_sitemap / t_siteindex sources.
siteindex = web.template.Template(t_siteindex)
sitemap = web.template.Template(t_sitemap)

# Module logger, named after this file's path, with its threshold set to DEBUG.
logger = logging.getLogger(__file__)
logger.setLevel(logging.DEBUG)


def log(*args) -> None:
    """Emit a timestamped message to the module logger and to stderr.

    Every positional argument is converted with ``str()`` and the results are
    joined with single spaces. The joined text is prefixed with the current
    ``datetime.now()`` value formatted as ``YYYY-MM-DD HH:MM:SS`` and the
    ``[openlibrary.dump]`` tag.
    """
    # Imported here because the file's import block does not import ``sys``;
    # without it the ``print`` below fails with NameError.
    import sys

    args_str = " ".join(str(a) for a in args)
    msg = f"{datetime.now():%Y-%m-%d %H:%M:%S} [openlibrary.dump] {args_str}"
    logger.info(msg)
    # The same line is also written directly to stderr.
    print(msg, file=sys.stderr)


def xopen(filename):
    """Open ``filename`` for reading, using ``gzip.open`` for ``.gz`` names.

    Both calls use their default mode, so a ``.gz`` file is returned as a
    binary gzip stream while any other file is returned as a text file object.
    """
    if filename.endswith(".gz"):
        return gzip.open(filename)
    return open(filename)

Expand All @@ -60,7 +70,7 @@ def urlsafe(name):
space = ' \n\r'

unsafe = reserved + delims + unwise + space
pattern = '[%s]+' % "".join(re.escape(c) for c in unsafe)
pattern = f"[{''.join(re.escape(c) for c in unsafe)}]+"
safepath_re = re.compile(pattern)
return safepath_re.sub('_', name).replace(' ', '-').strip('_')[:100]

Expand Down Expand Up @@ -116,14 +126,14 @@ def generate_sitemaps(filename):
things.append(web.storage(path=path, last_modified=last_modified))

if things:
write("sitemaps/sitemap_%s.xml.gz" % sortkey, sitemap(things))
write(f"sitemaps/sitemap_{sortkey}.xml.gz", sitemap(things))


def generate_siteindex():
    """Write ``sitemaps/siteindex.xml.gz`` listing the files in ``sitemaps/``.

    The directory listing is sorted, and the index file itself is left out of
    it. The listing and a UTC timestamp are rendered through the ``siteindex``
    template and handed to ``write``.
    """
    filenames = sorted(
        name for name in os.listdir("sitemaps") if name != "siteindex.xml.gz"
    )
    # Single timestamp assignment; the older ``datetime.datetime.utcnow()``
    # form fails once ``datetime`` is bound to the class rather than the module.
    timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
    index = siteindex(filenames, timestamp)
    write("sitemaps/siteindex.xml.gz", index)

Expand All @@ -135,7 +145,7 @@ def write(path, text):
with gzip.open(path, 'w') as f:
f.write(text.encode())
except Exception as e:
print(f'write fail {e}')
log(f'write fail {e}')
# os.system("gzip " + path)


Expand Down Expand Up @@ -163,11 +173,6 @@ def system(cmd):
raise Exception("%r failed with exit status: %d" % (cmd, status))


def log(*args):
    """Print the space-joined ``str()`` forms of ``args`` to stdout.

    The line is prefixed with the current ``time.asctime()`` value.
    """
    text = " ".join(str(arg) for arg in args)
    print(f"{time.asctime()} {text}")


def main(dumpfile):
system("rm -rf sitemaps sitemaps_data.txt*; mkdir sitemaps")

Expand Down

0 comments on commit 6717e2a

Please sign in to comment.