Skip to content

Commit

Permalink
[kemonoparty] skip duplicate files (#2032, #1991, #1899)
Browse files Browse the repository at this point in the history
Extract the SHA-256 file hash from URLs
and skip files with the same hash in the same post.

- provide a 'hash' metadata field (empty string if not available)
- remove 'patreon-skip-file' option
  • Loading branch information
mikf committed Nov 17, 2021
1 parent d4ec245 commit d433735
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 17 deletions.
10 changes: 0 additions & 10 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1370,16 +1370,6 @@ Description
Extract ``username`` metadata


extractor.kemonoparty.patreon-skip-file
---------------------------------------
Type
``bool``
Default
``true``
Description
Skip main files in Patreon posts to avoid duplicates.


extractor.khinsider.format
--------------------------
Type
Expand Down
27 changes: 20 additions & 7 deletions gallery_dl/extractor/kemonoparty.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def items(self):
self._find_inline = re.compile(
r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._skip_service = \
"patreon" if self.config("patreon-skip-file", True) else None
find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
generators = self._build_file_generators(self.config("files"))
comments = self.config("comments")

Expand All @@ -61,12 +60,25 @@ def items(self):
post["comments"] = self._extract_comments(post)
yield Message.Directory, post

hashes = set()
post["num"] = 0
for file in itertools.chain.from_iterable(
g(post) for g in generators):
url = file["path"]

match = find_hash(url)
if match:
post["hash"] = hash = match.group(1)
if hash in hashes:
self.log.debug("Skipping %s (duplicate)", url)
continue
hashes.add(hash)
else:
post["hash"] = ""

post["type"] = file["type"]
post["num"] += 1
url = file["path"]

if url[0] == "/":
url = self.root + "/data" + url
elif url.startswith("https://kemono.party"):
Expand Down Expand Up @@ -99,8 +111,6 @@ def _postfile(self, post):
if not file:
return ()
file["type"] = "file"
if post["service"] == self._skip_service and post["attachments"]:
return ()
return (file,)

def _attachments(self, post):
Expand Down Expand Up @@ -196,6 +206,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"embed": dict,
"extension": "jpeg",
"filename": "P058kDFYus7DbqAkGlfWTlOr",
"hash": "210f35388e28bbcf756db18dd516e2d8"
"2ce758e0d32881eeee76d43e1716d382",
"id": "506575",
"num": 1,
"published": "Sun, 11 Aug 2019 02:09:04 GMT",
Expand All @@ -211,6 +223,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
("https://kemono.party/fanbox/user/7356311/post/802343", {
"pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
"keyword": {"hash": "47b5c014ecdcfabdf2c85eec53f1133a"
"76336997ae8596f332e97d956a460ad2"},
}),
# kemono.party -> data.kemono.party
("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
Expand All @@ -223,10 +237,9 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"options": (("metadata", True),),
"keyword": {"username": "Kudalyn's Creations"},
}),
# skip patreon main file (#1667, #1689)
# skip patreon duplicates
("https://kemono.party/patreon/user/4158582/post/32099982", {
"count": 2,
"keyword": {"type": "attachment"},
}),
("https://kemono.party/subscribestar/user/alcorart/post/184330"),
)
Expand Down

2 comments on commit d433735

@iceo18
Copy link

@iceo18 iceo18 commented on d433735 Mar 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to not force the skip via configurations? I'm downloading a gallery that has multiple variations and would sometimes use the same frame/image on each variation. I've tried setting skip to false and adding page numbers to each file, but it will still skip duplicates.

I'm currently working around it by keeping an older version, but I wonder if it can be done even with updated releases.

@thatfuckingbird
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to not force the skip via configurations? I'm downloading a gallery that has multiple variations and would sometimes use the same frame/image on each variation. I've tried setting skip to false and adding page numbers to each file, but it will still skip duplicates.

I'm currently working around it by keeping an older version, but I wonder if it can be done even with updated releases.

Doesn't look like it from the code above.
Btw, you should open an issue; commenting here is not very visible (I only saw it because I have all notifications for this repo turned on).

Please sign in to comment.