Skip to content

Commit

Permalink
update archive IDs
Browse files Browse the repository at this point in the history
... to behave in a more straightforward way when dealing with
bookmarks/favourites/etc.

Specific IDs are now grouped by their owner, album-id, etc. to
allow for duplicates where they would be expected.
  • Loading branch information
mikf committed Mar 1, 2018
1 parent 829ddf4 commit 5008e10
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 27 deletions.
20 changes: 13 additions & 7 deletions gallery_dl/extractor/booru.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class BooruExtractor(SharedConfigExtractor):
"""Base class for all booru extractors"""
basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
api_url = ""
per_page = 50
page_start = 1
Expand All @@ -39,20 +38,23 @@ def skip(self, num):
return pages * self.per_page

def items(self):
data = self.get_metadata()

yield Message.Version, 1
yield Message.Directory, self.get_metadata()
yield Message.Directory, data

self.reset_page()
while True:
images = self.parse_response(
self.request(self.api_url, params=self.params))

for data in images:
for image in images:
try:
url = data["file_url"]
url = image["file_url"]
if url.startswith("/"):
url = urljoin(self.api_url, url)
yield Message.Url, url, text.nameext_from_url(url, data)
image.update(data)
yield Message.Url, url, text.nameext_from_url(url, image)
except KeyError:
continue

Expand Down Expand Up @@ -115,7 +117,8 @@ def update_page(self, data):
class TagMixin():
"""Extraction of images based on search-tags"""
subcategory = "tag"
directory_fmt = ["{category}", "{tags}"]
directory_fmt = ["{category}", "{search_tags}"]
archive_fmt = "t_{search_tags}_{id}"

def __init__(self, match):
super().__init__(match)
Expand All @@ -124,13 +127,14 @@ def __init__(self, match):
self.params["limit"] = self.per_page

def get_metadata(self):
return {"tags": self.tags}
return {"search_tags": self.tags}


class PoolMixin():
"""Extraction of image-pools"""
subcategory = "pool"
directory_fmt = ["{category}", "pool", "{pool}"]
archive_fmt = "p_{pool}_{id}"

def __init__(self, match):
super().__init__(match)
Expand All @@ -145,6 +149,7 @@ def get_metadata(self):
class PostMixin():
"""Extraction of a single image-post"""
subcategory = "post"
archive_fmt = "{id}"

def __init__(self, match):
super().__init__(match)
Expand All @@ -156,6 +161,7 @@ class PopularMixin():
"""Extraction and metadata handling for Danbooru v2"""
subcategory = "popular"
directory_fmt = ["{category}", "popular", "{scale}", "{date}"]
archive_fmt = "P_{scale[0]}_{date}_{id}"
page_start = None
sort = True

Expand Down
10 changes: 8 additions & 2 deletions gallery_dl/extractor/deviantart.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ class DeviantartExtractor(Extractor):
category = "deviantart"
directory_fmt = ["{category}", "{author[username]!l}"]
filename_fmt = "{category}_{index}_{title}.{extension}"
archive_fmt = "{index}.{extension}"

def __init__(self, match=None):
Extractor.__init__(self)
Expand Down Expand Up @@ -166,6 +165,8 @@ def _folder_urls(self, folders, category):
class DeviantartGalleryExtractor(DeviantartExtractor):
"""Extractor for all deviations from an artist's gallery"""
subcategory = "gallery"
archive_fmt = "g_{username}_{index}.{extension}"

pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"]
test = [
Expand All @@ -192,6 +193,7 @@ class DeviantartFolderExtractor(DeviantartExtractor):
"""Extractor for deviations inside an artist's gallery folder"""
subcategory = "folder"
directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"]
archive_fmt = "F_{folder[index]}_{index}.{extension}"
pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
r"/gallery/(\d+)/([^/?&#]+)"]
test = [
Expand Down Expand Up @@ -225,6 +227,7 @@ def prepare(self, deviation):
class DeviantartDeviationExtractor(DeviantartExtractor):
"""Extractor for single deviations"""
subcategory = "deviation"
archive_fmt = "{index}.{extension}"
pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/"
r"(?:art|journal)/[^/?&#]+-\d+)"),
(r"(?:https?://)?(sta\.sh/[a-z0-9]+)")]
Expand Down Expand Up @@ -268,6 +271,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
"""Extractor for an artist's favorites"""
subcategory = "favorite"
directory_fmt = ["{category}", "{username}", "Favourites"]
archive_fmt = "f_{username}_{index}.{extension}"
pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
r"/favourites/?(?:\?catpath=/)?$"]
test = [
Expand Down Expand Up @@ -295,12 +299,13 @@ class DeviantartCollectionExtractor(DeviantartExtractor):
subcategory = "collection"
directory_fmt = ["{category}", "{collection[owner]}",
"Favourites", "{collection[title]}"]
archive_fmt = "C_{collection[index]}_{index}.{extension}"
pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
r"/favourites/(\d+)/([^/?&#]+)"]
test = [(("https://pencilshadings.deviantart.com"
"/favourites/70595441/3D-Favorites"), {
"url": "742f92199d5bc6a89cda6ec6133d46c7a523824d",
"keyword": "9210c976b5274eff6ea1d2b8a4f891c9f35ce340",
"keyword": "5da3a16e85150d2a09e074b2b2ee916099b52737",
"options": (("original", False),),
})]

Expand All @@ -324,6 +329,7 @@ class DeviantartJournalExtractor(DeviantartExtractor):
"""Extractor for an artist's journals"""
subcategory = "journal"
directory_fmt = ["{category}", "{username}", "Journal"]
archive_fmt = "j_{username}_{index}.{extension}"
pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
r"/(?:journal|blog)/?(?:\?catpath=/)?$"]
test = [
Expand Down
8 changes: 7 additions & 1 deletion gallery_dl/extractor/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class FlickrExtractor(Extractor):
"""Base class for flickr extractors"""
category = "flickr"
filename_fmt = "{category}_{id}.{extension}"
archive_fmt = "{id}"

def __init__(self, match):
Extractor.__init__(self)
Expand Down Expand Up @@ -45,6 +44,7 @@ def photos(self):
class FlickrImageExtractor(FlickrExtractor):
"""Extractor for individual images from flickr.com"""
subcategory = "image"
archive_fmt = "{id}"
pattern = [r"(?:https?://)?(?:www\.|m\.)?flickr\.com/photos/[^/]+/(\d+)",
r"(?:https?://)?[^.]+\.static\.?flickr\.com/(?:\d+/)+(\d+)_",
r"(?:https?://)?flic\.kr/(p)/([A-Za-z1-9]+)"]
Expand Down Expand Up @@ -108,6 +108,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
subcategory = "album"
directory_fmt = ["{category}", "{subcategory}s",
"{album[id]} - {album[title]}"]
archive_fmt = "a_{album[id]}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
r"photos/([^/]+)/(?:album|set)s/(\d+)"]
test = [(("https://www.flickr.com/photos/"
Expand Down Expand Up @@ -143,6 +144,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
subcategory = "gallery"
directory_fmt = ["{category}", "galleries",
"{user[username]} {gallery[id]}"]
archive_fmt = "g_{gallery[id]}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
r"photos/([^/]+)/galleries/(\d+)"]
test = [(("https://www.flickr.com/photos/flickr/"
Expand Down Expand Up @@ -171,6 +173,7 @@ class FlickrGroupExtractor(FlickrExtractor):
"""Extractor for group pools from flickr.com"""
subcategory = "group"
directory_fmt = ["{category}", "{subcategory}s", "{group[groupname]}"]
archive_fmt = "G_{group[nsid]}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"]
test = [("https://www.flickr.com/groups/bird_headshots/", {
"pattern": (r"https?://farm\d+\.staticflickr\.com"
Expand All @@ -189,6 +192,7 @@ class FlickrUserExtractor(FlickrExtractor):
"""Extractor for the photostream of a flickr user"""
subcategory = "user"
directory_fmt = ["{category}", "{user[username]}"]
archive_fmt = "u_{user[nsid]}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"]
test = [("https://www.flickr.com/photos/shona_s/", {
"url": "d125b536cd8c4229363276b6c84579c394eec3a2",
Expand All @@ -203,6 +207,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
"""Extractor for favorite photos of a flickr user"""
subcategory = "favorite"
directory_fmt = ["{category}", "{subcategory}s", "{user[username]}"]
archive_fmt = "f_{user[nsid]}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"]
test = [("https://www.flickr.com/photos/shona_s/favorites", {
"url": "5129b3f5bfa83cc25bdae3ce476036de1488dad2",
Expand All @@ -217,6 +222,7 @@ class FlickrSearchExtractor(FlickrExtractor):
"""Extractor for flickr photos based on search results"""
subcategory = "search"
directory_fmt = ["{category}", "{subcategory}", "{search[text]}"]
archive_fmt = "s_{search}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"]
test = [
(("https://flickr.com/search/?text=mountain"), None),
Expand Down
13 changes: 9 additions & 4 deletions gallery_dl/extractor/gelbooru.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class GelbooruExtractor(SharedConfigExtractor):
basecategory = "booru"
category = "gelbooru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"

def __init__(self):
Expand All @@ -29,15 +28,18 @@ def __init__(self):
self.get_post_data = self.get_post_data_api

def items(self):
data = self.get_metadata()

yield Message.Version, 1
yield Message.Directory, self.get_metadata()
yield Message.Directory, data

for post in util.advance(self.get_posts(), self.start_post):
if isinstance(post, str):
post = self.get_post_data(post)
for key in ("id", "width", "height", "score", "change"):
post[key] = util.safe_int(post[key])
url = post["file_url"]
post.update(data)
yield Message.Url, url, text.nameext_from_url(url, post)

def skip(self, num):
Expand Down Expand Up @@ -85,7 +87,8 @@ def get_post_data_api(self, post_id):
class GelbooruTagExtractor(GelbooruExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
subcategory = "tag"
directory_fmt = ["{category}", "{tags}"]
directory_fmt = ["{category}", "{search_tags}"]
archive_fmt = "t_{search_tags}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=([^&]+)"]
test = [
Expand All @@ -111,7 +114,7 @@ def skip(self, num):
return num

def get_metadata(self):
return {"tags": self.tags}
return {"search_tags": self.tags}

def get_posts(self):
if self.use_api:
Expand Down Expand Up @@ -149,6 +152,7 @@ class GelbooruPoolExtractor(GelbooruExtractor):
"""Extractor for image-pools from gelbooru.com"""
subcategory = "pool"
directory_fmt = ["{category}", "pool", "{pool}"]
archive_fmt = "p_{pool}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(\d+)"]
test = [("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
Expand Down Expand Up @@ -182,6 +186,7 @@ def get_posts(self):
class GelbooruPostExtractor(GelbooruExtractor):
"""Extractor for single images from gelbooru.com"""
subcategory = "post"
archive_fmt = "{id}"
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(\d+)"]
test = [("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
Expand Down
2 changes: 1 addition & 1 deletion gallery_dl/extractor/imagebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
subcategory = "gallery"
directory_fmt = ["{category}", "{title} - {gallery_key}"]
filename_fmt = "{num:>03}-{name}.{extension}"
archive_fmt = "{image_id}"
archive_fmt = "{gallery_key}_{image_id}"
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"]
test = [(("http://www.imagebam.com/"
"gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), {
Expand Down
3 changes: 2 additions & 1 deletion gallery_dl/extractor/imgbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
class ImgboxExtractor(Extractor):
"""Base class for imgbox extractors"""
category = "imgbox"
archive_fmt = "{image_key}"
root = "https://imgbox.com"

def items(self):
Expand Down Expand Up @@ -64,6 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
subcategory = "gallery"
directory_fmt = ["{category}", "{title} - {gallery_key}"]
filename_fmt = "{num:>03}-{name}.{extension}"
archive_fmt = "{gallery_key}_{image_key}"
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
test = [
("https://imgbox.com/g/JaX5V5HX7g", {
Expand Down Expand Up @@ -106,6 +106,7 @@ def get_image_keys(self):
class ImgboxImageExtractor(ImgboxExtractor):
"""Extractor for single images from imgbox.com"""
subcategory = "image"
archive_fmt = "{image_key}"
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"]
test = [
("https://imgbox.com/qHhw7lpG", {
Expand Down
7 changes: 4 additions & 3 deletions gallery_dl/extractor/imgchili.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
class ImgchiliExtractor(Extractor):
"""Base class for imgchili extractors"""
category = "imgchili"
archive_fmt = "{image_id}"
root = "https://imgchili.net"

def __init__(self, match):
Expand Down Expand Up @@ -45,6 +44,7 @@ def get_images(self, page):
class ImgchiliImageExtractor(ImgchiliExtractor):
"""Extractor for single images from imgchili.net"""
subcategory = "image"
archive_fmt = "{image_id}"
pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/show/\d+/(\d+)_[^/]+"]
test = [(("http://imgchili.net/show/89427/"
"89427136_test___quot;___gt;.png"), {
Expand All @@ -71,7 +71,8 @@ def get_images(self, page):
class ImgchiliAlbumExtractor(ImgchiliExtractor):
"""Extractor for image-albums from imgchili.net"""
subcategory = "album"
directory_fmt = ["{category}", "{title} - {key}"]
directory_fmt = ["{category}", "{title} - {album_id}"]
archive_fmt = "{album_id}_{image_id}"
filename_fmt = "{num:>03} {filename}"
pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)"]
test = [("http://imgchili.net/album/7a3824c59f77c8d39b260f9168d4b49b", {
Expand All @@ -83,7 +84,7 @@ def get_job_metadata(self, page):
title = text.extract(page, "<h1>", "</h1>")[0]
return {
"title": text.unescape(title),
"key": self.match.group(1),
"album_id": self.match.group(1),
}

def get_images(self, page):
Expand Down
Loading

5 comments on commit 5008e10

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 5008e10 Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For tumblr.py, this archive_fmt = "f_{blog[name]}_{id}_{offset}" is currently only defined for the class TumblrLikesExtractor. Is this intentional?

Not sure because of TumblrUserExtractor for example... or are the IDs always the same?

@mikf
Copy link
Owner Author

@mikf mikf commented on 5008e10 Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other Tumblr extractors are currently using archive_fmt = "{id}_{offset}" (defined in the TumblrExtractor base class).

I thought it would be OK to group the results of user, post and tag extractors together, since they all download their media into the same directory anyway.

I could change the archive-format of user-extractors, for example, to u_{blog[name]}_{id}_{offset}, just to be on the safe side and to have a separate "namespace" for each blog, but that would be redundant, I think.

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 5008e10 Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed, and the base class is using directory_fmt = ["{category}", "{name}"] for the output directory. So how would this work with reblogs, for example? If you download blog A, which has a specific post X, and then download blog B, which has a reblog of that post X, would the reblog already count as archived (and thus be missing from the directory for blog B)? Or are the IDs always unique across different blogs?

@mikf
Copy link
Owner Author

@mikf mikf commented on 5008e10 Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A reblog has a different ID than the original post.

For example:
original: https://mikf123.tumblr.com/post/167623548569
its reblog: https://mikf123.tumblr.com/post/169341068404

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 5008e10 Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same goes for successive reblogs on another blog... okay, so this seems safe to use. The same also applies to re-uploads.

Good, this is fine then. Although Tumblr's ID format still does not make any sense to me...

Please sign in to comment.