[xhamster] add gallery & user extractor (#281)

mikf · Jun 5, 2019 · 0960093 · 0960093
1 parent 208202b
commit 0960093
Show file tree

Hide file tree

Showing 4 changed files with 174 additions and 0 deletions.
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
@@ -104,6 +104,7 @@ Warosu               https://warosu.org/                 Threads
 Weibo                https://www.weibo.com/              Images from Users, Images from Statuses
 WikiArt.org          https://www.wikiart.org/            Artists, Artworks
 World Three          http://www.slide.world-three.org/   Chapters, Manga
+xHamster             https://xhamster.com/               Images from Users, Galleries
 XVideos              https://www.xvideos.com/            Images from Users, Galleries
 Yandere              https://yande.re/                   Pools, Popular Images, Posts, Tag-Searches
 yaplog!              https://yaplog.jp/                  Blogs, Posts

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -93,6 +93,7 @@
     "warosu",
     "weibo",
     "wikiart",
+    "xhamster",
     "xvideos",
     "yandere",
     "yaplog",

diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://xhamster.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+# Common URL prefix shared by all extractor patterns in this module:
+# an optional scheme, an optional dot-free prefix label (e.g. "en."),
+# and the xhamster domain under one of the listed TLDs.
+BASE_PATTERN = (
+    r"(?:https?://)?"
+    r"(?:[^.]+\.)?"
+    r"xhamster\.(?:com|one|desi)"
+)
+
+
+class XhamsterExtractor(Extractor):
+    """Common base class of all xhamster extractors"""
+    # site root; subclasses build their request URLs from it
+    root = "https://xhamster.com"
+    # category name shared by every subclass
+    category = "xhamster"
+
+
+class XhamsterGalleryExtractor(XhamsterExtractor):
+    """Extractor for a single image gallery on xhamster.com
+
+    Gallery pages embed a JSON object assigned to 'window.initials';
+    the gallery metadata, the image list, and the pagination info
+    are all read from that object.
+    """
+    subcategory = "gallery"
+    directory_fmt = ("{category}", "{user[name]}",
+                     "{gallery[id]} {gallery[title]}")
+    filename_fmt = "{num:>03}_{id}.{extension}"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)"
+    test = (
+        ("https://xhamster.com/photos/gallery/11748968", {
+            "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
+            "count": 143,
+            "keyword": {
+                "comments": int,
+                "count": 143,
+                "favorite": bool,
+                "id": int,
+                "num": int,
+                "height": int,
+                "width": int,
+                "imageURL": str,
+                "pageURL": str,
+                "thumbURL": str,
+                "gallery": {
+                    "date": "type:datetime",
+                    "description": "",
+                    "dislikes": int,
+                    "id": 11748968,
+                    "likes": int,
+                    "tags": ["NON-Porn"],
+                    "thumbnail": str,
+                    "title": "Make the world better.",
+                    "views": int,
+                },
+                "user": {
+                    "id": 16874672,
+                    "name": "Anonymousrants",
+                    "retired": bool,
+                    "subscribers": int,
+                    "url": "https://xhamster.com/users/anonymousrants",
+                    "verified": bool,
+                },
+            },
+        }),
+        ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
+        ("https://xhamster.com/photos/gallery/11748968"),
+        ("https://xhamster.one/photos/gallery/11748968"),
+        ("https://xhamster.desi/photos/gallery/11748968"),
+        ("https://en.xhamster.com/photos/gallery/11748968"),
+    )
+
+    def __init__(self, match):
+        XhamsterExtractor.__init__(self, match)
+        # data of the first gallery page; filled in by metadata()
+        # and handed over to images()
+        self.data = None
+        # URL path of the gallery, as captured by 'pattern'
+        self.path = match.group(1)
+
+    def items(self):
+        """Yield version, directory, and one URL message per image"""
+        metadata = self.metadata()
+
+        yield Message.Version, 1
+        yield Message.Directory, metadata
+
+        for index, photo in enumerate(self.images(), start=1):
+            image_url = photo["imageURL"]
+            # each image gets the gallery-wide metadata merged in,
+            # plus its own 1-based position inside the gallery
+            photo.update(metadata)
+            photo["num"] = index
+            yield (Message.Url, image_url,
+                   text.nameext_from_url(image_url, photo))
+
+    def metadata(self):
+        """Load the gallery page and return its gallery-wide metadata
+
+        The parsed page data is kept in 'self.data', so that images()
+        can start from it without requesting the first page again.
+        """
+        self.data = page = self._data(self.root + self.path)
+        author = page["authorModel"]
+        gallery = page["photosGalleryModel"]
+
+        user_info = {
+            "id"         : text.parse_int(author["id"]),
+            "url"        : author["pageURL"],
+            "name"       : author["name"],
+            "retired"    : author["retired"],
+            "verified"   : author["verified"],
+            "subscribers": author["subscribers"],
+        }
+        gallery_info = {
+            "id"         : text.parse_int(gallery["id"]),
+            "tags"       : [cat["name"] for cat in gallery["categories"]],
+            "date"       : text.parse_timestamp(gallery["created"]),
+            "views"      : text.parse_int(gallery["views"]),
+            "likes"      : text.parse_int(gallery["rating"]["likes"]),
+            "dislikes"   : text.parse_int(gallery["rating"]["dislikes"]),
+            "title"      : gallery["title"],
+            "description": gallery["description"],
+            "thumbnail"  : gallery["thumbURL"],
+        }
+
+        return {
+            "user"   : user_info,
+            "gallery": gallery_info,
+            "count"  : text.parse_int(gallery["quantity"]),
+        }
+
+    def images(self):
+        """Yield the image objects of all pages of the gallery
+
+        Expects metadata() to have stored the first page's data in
+        'self.data'; that reference is cleared once it is taken over.
+        """
+        page, self.data = self.data, None
+
+        while True:
+            for photo in page["photosGalleryModel"]["photos"]:
+                # 'modelName' is dropped from the per-image metadata
+                photo.pop("modelName")
+                yield photo
+
+            pagination = page["pagination"]
+            if pagination["active"] != pagination["maxPage"]:
+                # the link template's last 3 characters get replaced
+                # by the number of the next page
+                template = pagination["pageLinkTemplate"]
+                page = self._data(template[:-3] + str(pagination["next"]))
+                continue
+            break
+
+    def _data(self, url):
+        """Request 'url' and return its parsed 'window.initials' object"""
+        content = self.request(url).text
+        initials = text.extract(
+            content, "window.initials =", "</script>")[0]
+        # strip the trailing semicolon and line breaks of the
+        # JavaScript statement before decoding it as JSON
+        return json.loads(initials.rstrip("\n\r;"))
+
+
+class XhamsterUserExtractor(XhamsterExtractor):
+    """Extractor that queues all photo galleries of an xhamster user"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])"
+    test = (
+        ("https://xhamster.com/users/nickname68/photos", {
+            "pattern": XhamsterGalleryExtractor.pattern,
+            "count": 50,
+            "range": "1-50",
+        }),
+        ("https://xhamster.com/users/nickname68"),
+    )
+
+    def __init__(self, match):
+        XhamsterExtractor.__init__(self, match)
+        # user name as captured from the URL by 'pattern'
+        self.user = match.group(1)
+
+    def items(self):
+        """Yield a queue message for every gallery link of the user
+
+        Starts at the user's photo listing and follows the 'next'
+        page link until a page without one is reached.
+        """
+        yield Message.Version, 1
+
+        # tells the consumer which extractor handles the queued URLs
+        queue_data = {"_extractor": XhamsterGalleryExtractor}
+        page_url = "{}/users/{}/photos".format(self.root, self.user)
+
+        while page_url:
+            extr = text.extract_from(self.request(page_url).text)
+
+            # gallery links are read sequentially from the page;
+            # a falsy result means there are no more on this page
+            gallery_url = extr('thumb-image-container" href="', '"')
+            while gallery_url:
+                yield Message.Queue, gallery_url, queue_data
+                gallery_url = extr('thumb-image-container" href="', '"')
+
+            page_url = extr('data-page="next" href="', '"')
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -69,6 +69,7 @@
     "thebarchive"    : "The /b/ Archive",
     "wikiart"        : "WikiArt.org",
     "worldthree"     : "World Three",
+    "xhamster"       : "xHamster",
     "xvideos"        : "XVideos",
     "yaplog"         : "yaplog!",
     "yuki"           : "yuki.la 4chan archive",