[comicvine] add extractor (closes #1712)

mikf · Jul 23, 2021 · da7297c · da7297c · phanirithvij · Jul 23, 2021
1 parent e4788fa
commit da7297c
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 0 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -97,6 +97,12 @@ Consider all sites to be NSFW unless otherwise known.
     <td>Blogs, Posts, Search Results</td>
     <td></td>
 </tr>
+<tr>
+    <td>Comic Vine</td>
+    <td>https://comicvine.gamespot.com/</td>
+    <td>Tag Searches</td>
+    <td></td>
+</tr>
 <tr>
     <td>Cyberdrop</td>
     <td>https://cyberdrop.me/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -24,6 +24,7 @@
     "bcy",
     "behance",
     "blogger",
+    "comicvine",
     "cyberdrop",
     "danbooru",
     "deviantart",

diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://comicvine.gamespot.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import operator
+
+
+class ComicvineTagExtractor(BooruExtractor):
+    """Extractor for a gallery on comicvine.gamespot.com"""
+    category = "comicvine"
+    subcategory = "tag"
+    basecategory = ""
+    root = "https://comicvine.gamespot.com"
+    per_page = 1000  # sent as "count" with every image-data request
+    directory_fmt = ("{category}", "{tag}")
+    filename_fmt = "{filename}.{extension}"
+    archive_fmt = "{id}"
+    pattern = (r"(?:https?://)?comicvine\.gamespot\.com"
+               r"(/([^/?#]+)/(\d+-\d+)/images/.*)")  # groups: path, name, id
+    test = (
+        ("https://comicvine.gamespot.com/jock/4040-5653/images/", {
+            "pattern": r"https://comicvine\.gamespot\.com/a/uploads"
+                       r"/original/\d+/\d+/\d+-.+\.(jpe?g|png)",
+            "count": ">= 140",
+        }),
+        (("https://comicvine.gamespot.com/batman/4005-1699"
+          "/images/?tag=Fan%20Art%20%26%20Cosplay"), {
+            "pattern": r"https://comicvine\.gamespot\.com/a/uploads"
+                       r"/original/\d+/\d+/\d+-.+",
+            "count": ">= 450",
+        }),
+    )
+
+    def __init__(self, match):
+        BooruExtractor.__init__(self, match)
+        self.path, self.object_name, self.object_id = match.groups()
+
+    def metadata(self):
+        return {"tag": text.unquote(self.object_name)}  # key of "{tag}"
+
+    def posts(self):
+        url = self.root + "/js/image-data.json"  # paginated JSON endpoint
+        params = {
+            "images": text.extract(
+                self.request(self.root + self.path).text,
+                'data-gallery-id="', '"')[0],  # id read from the page HTML
+            "start" : self.page_start,  # item offset, see skip()
+            "count" : self.per_page,
+            "object": self.object_id,
+        }
+
+        while True:
+            images = self.request(url, params=params).json()["images"]
+            yield from images
+
+            if len(images) < self.per_page:  # short page: nothing left
+                return
+            params["start"] += self.per_page  # advance to the next page
+
+    def skip(self, num):
+        self.page_start = num  # used as the initial "start" in posts()
+        return num
+
+    _file_url = operator.itemgetter("original")  # returns post["original"]
+
+    @staticmethod
+    def _prepare(post):
+        post["date"] = text.parse_datetime(
+            post["dateCreated"], "%a, %b %d %Y")  # e.g. "Fri, Jul 23 2021"
+        post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -24,6 +24,7 @@
     "baraag"         : "baraag",
     "bbc"            : "BBC",
     "bcy"            : "半次元",
+    "comicvine"      : "Comic Vine",
     "deviantart"     : "DeviantArt",
     "dokireader"     : "Doki Reader",
     "drawfriends"    : "Draw Friends",