Skip to content

Commit

Permalink
[postprocessor:metadata] implement archive options (#2421)
Browse files Browse the repository at this point in the history
'archive', 'archive-format', and 'archive-prefix'
  • Loading branch information
mikf committed Mar 20, 2022
1 parent be34927 commit 9bd27b1
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 18 deletions.
17 changes: 16 additions & 1 deletion docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,9 @@ Description
memory requirements are significantly lower when the
amount of stored IDs gets reasonably large.

Note: archive paths support regular `format string`_ replacements,
Note: Archive files that do not already exist get generated automatically.

Note: Archive paths support regular `format string`_ replacements,
but be aware that using external inputs for building local paths
may pose a security risk.

Expand Down Expand Up @@ -3139,6 +3141,19 @@ Description
Note: Only applies for ``"mode": "custom"``.


metadata.archive
----------------
Type
|Path|_
Description
File to store IDs of generated metadata files in,
similar to `extractor.*.archive`_.

``archive-format`` and ``archive-prefix`` options,
akin to `extractor.*.archive-format`_ and `extractor.*.archive-prefix`_,
are supported as well.


metadata.mtime
--------------
Type
Expand Down
22 changes: 12 additions & 10 deletions gallery_dl/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,8 +389,10 @@ def get_downloader(self, scheme):

def initialize(self, kwdict=None):
"""Delayed initialization of PathFormat, etc."""
cfg = self.extractor.config
pathfmt = self.pathfmt = path.PathFormat(self.extractor)
extr = self.extractor
cfg = extr.config

pathfmt = self.pathfmt = path.PathFormat(extr)
if kwdict:
pathfmt.set_directory(kwdict)

Expand All @@ -403,17 +405,18 @@ def initialize(self, kwdict=None):
archive = cfg("archive")
if archive:
archive = util.expand_path(archive)
archive_format = (cfg("archive-prefix", extr.category) +
cfg("archive-format", extr.archive_fmt))
try:
if "{" in archive:
archive = formatter.parse(archive).format_map(kwdict)
self.archive = util.DownloadArchive(archive, self.extractor)
self.archive = util.DownloadArchive(archive, archive_format)
except Exception as exc:
self.extractor.log.warning(
extr.log.warning(
"Failed to open download archive at '%s' ('%s: %s')",
archive, exc.__class__.__name__, exc)
else:
self.extractor.log.debug(
"Using download archive '%s'", archive)
extr.log.debug("Using download archive '%s'", archive)

skip = cfg("skip", True)
if skip:
Expand All @@ -435,7 +438,7 @@ def initialize(self, kwdict=None):
if self.archive:
self.archive.check = pathfmt.exists

postprocessors = self.extractor.config_accumulate("postprocessors")
postprocessors = extr.config_accumulate("postprocessors")
if postprocessors:
self.hooks = collections.defaultdict(list)
pp_log = self.get_logger("postprocessor")
Expand All @@ -453,7 +456,7 @@ def initialize(self, kwdict=None):
clist = pp_dict.get("blacklist")
negate = True
if clist and not util.build_extractor_filter(
clist, negate)(self.extractor):
clist, negate)(extr):
continue

name = pp_dict.get("name")
Expand All @@ -471,8 +474,7 @@ def initialize(self, kwdict=None):
pp_list.append(pp_obj)

if pp_list:
self.extractor.log.debug(
"Active postprocessor modules: %s", pp_list)
extr.log.debug("Active postprocessor modules: %s", pp_list)
if "init" in self.hooks:
for callback in self.hooks["init"]:
callback(pathfmt)
Expand Down
29 changes: 29 additions & 0 deletions gallery_dl/postprocessor/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,35 @@ def __init__(self, job, options):
events = events.split(",")
job.register_hooks({event: self.run for event in events}, options)

# Optional archive that records which metadata files were already written.
# Default to "no archive" first: if opening the archive below fails, only a
# warning is logged, and run() must still be able to read self.archive
# without raising AttributeError.
self.archive = None

archive = options.get("archive")
if archive:
    extr = job.extractor
    archive = util.expand_path(archive)
    # Key format: "archive-prefix" (default: extractor category) followed
    # by "archive-format" (default: "_MD_" + the extractor's archive_fmt).
    archive_format = (
        options.get("archive-prefix", extr.category) +
        options.get("archive-format", "_MD_" + extr.archive_fmt))
    try:
        # Paths containing "{" are treated as format strings and are
        # expanded with the current path-format metadata.
        if "{" in archive:
            archive = formatter.parse(archive).format_map(
                job.pathfmt.kwdict)
        # "_archive_metadata" is passed as the cache key under which the
        # generated archive key is stored in each kwdict.
        self.archive = util.DownloadArchive(
            archive, archive_format, "_archive_metadata")
    except Exception as exc:
        # Best effort: keep running without an archive (self.archive
        # stays None) instead of aborting the postprocessor.
        self.log.warning(
            "Failed to open download archive at '%s' ('%s: %s')",
            archive, exc.__class__.__name__, exc)
    else:
        self.log.debug("Using download archive '%s'", archive)

self.mtime = options.get("mtime")

def run(self, pathfmt):
archive = self.archive
if archive and archive.check(pathfmt.kwdict):
return

directory = self._directory(pathfmt)
path = directory + self._filename(pathfmt)

Expand All @@ -73,6 +99,9 @@ def run(self, pathfmt):
with open(path, "w", encoding="utf-8") as fp:
self.write(fp, pathfmt.kwdict)

if archive:
archive.add(pathfmt.kwdict)

if self.mtime:
mtime = pathfmt.kwdict.get("_mtime")
if mtime:
Expand Down
13 changes: 6 additions & 7 deletions gallery_dl/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,11 +672,14 @@ def __str__(self):

class DownloadArchive():

def __init__(self, path, extractor):
def __init__(self, path, format_string, cache_key="_archive_key"):
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
con.isolation_level = None

self.close = con.close
self.cursor = con.cursor()
self.keygen = format_string.format_map
self._cache_key = cache_key

try:
self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
Expand All @@ -685,20 +688,16 @@ def __init__(self, path, extractor):
# fallback for missing WITHOUT ROWID support (#553)
self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
"(entry PRIMARY KEY)")
self.keygen = (
extractor.config("archive-prefix", extractor.category) +
extractor.config("archive-format", extractor.archive_fmt)
).format_map

def check(self, kwdict):
"""Return True if the item described by 'kwdict' exists in archive"""
key = kwdict["_archive_key"] = self.keygen(kwdict)
key = kwdict[self._cache_key] = self.keygen(kwdict)
self.cursor.execute(
"SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
return self.cursor.fetchone()

def add(self, kwdict):
"""Add item described by 'kwdict' to archive"""
key = kwdict.get("_archive_key") or self.keygen(kwdict)
key = kwdict.get(self._cache_key) or self.keygen(kwdict)
self.cursor.execute(
"INSERT OR IGNORE INTO archive VALUES (?)", (key,))

0 comments on commit 9bd27b1

Please sign in to comment.