[instagram] add 'include' option (closes #1180)

Split the functionality of the old 'user' extractor into separate 'posts' and 'highlights' extractors, which respond to virtual URLs ('/<user>/posts' and '/<user>/highlights').
mikf · Dec 21, 2020 · bf629a2 · bf629a2
1 parent 7806165
commit bf629a2
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 83 deletions.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -1045,15 +1045,22 @@ Description
     for details)
 
 
-extractor.instagram.highlights
-------------------------------
+extractor.instagram.include
+---------------------------
 Type
-    ``bool``
+    ``string`` or ``list`` of ``strings``
 Default
-    ``false``
+    ``"posts"``
+Example
+    ``"stories,highlights,posts"`` or ``["stories", "highlights", "posts"]``
 Description
-    Include *Story Highlights* when downloading a user profile.
-    (requires authentication)
+    A (comma-separated) list of subcategories to include
+    when processing a user profile.
+
+    Possible values are
+    ``"posts"``, ``"stories"``, ``"highlights"``, ``"channel"``.
+
+    You can use ``"all"`` instead of listing all values separately.
 
 
 extractor.instagram.videos

diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
@@ -157,7 +157,7 @@ Turboimagehost       https://www.turboimagehost.com/     individual Images
 .. |furaffinity-C| replace:: Favorites, Galleries, Posts, Scraps, Search Results, User Profiles
 .. |hentaifoundry-C| replace:: Favorites, individual Images, Pictures, Popular Images, Recent Images, Scraps, Stories, User Profiles
 .. |imgur-C| replace:: Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles
-.. |instagram-C| replace:: Channels, Posts, Saved Posts, Stories, Tag Searches, User Profiles
+.. |instagram-C| replace:: Channels, Highlights, Posts, Saved Posts, Stories, Tag Searches, User Profiles
 .. |newgrounds-C| replace:: Art, Audio, Favorites, individual Images, Media Files, Movies, User Profiles
 .. |nijie-C| replace:: Doujin, Favorites, Illustrations, individual Images, User Profiles
 .. |pixiv-C| replace:: Favorites, Follows, pixiv.me Links, Rankings, Search Results, User Profiles, individual Images

diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
@@ -12,11 +12,13 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
-import itertools
 import json
 import time
 import re
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
+USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
+
 
 class InstagramExtractor(Extractor):
     """Base class for instagram extractors"""
@@ -31,6 +33,7 @@ class InstagramExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.item = match.group(1)
         self.www_claim = "0"
         self.csrf_token = util.generate_csrf_token()
         self._find_tags = re.compile(r"#\w+").findall
@@ -68,15 +71,18 @@ def posts(self):
 
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
+
         if response.history and "/accounts/login/" in response.request.url:
             if self._cursor:
                 self.log.info("Use '-o cursor=%s' to continue downloading "
                               "from the current position", self._cursor)
             raise exception.StopExtraction(
-                "Redirected to login page (%s)", response.request.url)
+                "HTTP redirect to login page (%s)", response.request.url)
+
         www_claim = response.headers.get("x-ig-set-www-claim")
         if www_claim is not None:
             self.www_claim = www_claim
+
         return response
 
     def _api_request(self, endpoint, params):
@@ -340,9 +346,9 @@ def _pagination(self, query_hash, variables, data):
             if not info["has_next_page"]:
                 return
             elif not data["edges"] and "_virtual" not in info:
-                s = "" if self.user.endswith("s") else "s"
+                s = "" if self.item.endswith("s") else "s"
                 raise exception.StopExtraction(
-                    "%s'%s posts are private", self.user, s)
+                    "%s'%s posts are private", self.item, s)
 
             variables["after"] = self._cursor = info["end_cursor"]
             self.log.debug("Cursor: %s", self._cursor)
@@ -351,80 +357,62 @@ def _pagination(self, query_hash, variables, data):
 
 
 class InstagramUserExtractor(InstagramExtractor):
-    """Extractor for ProfilePage"""
+    """Extractor for an Instagram user profile"""
     subcategory = "user"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
-               r"([^/?#]+)/?(?:$|[?#])")
+    pattern = USER_PATTERN + r"/?(?:$|[?#])"
     test = (
-        ("https://www.instagram.com/instagram/", {
-            "range": "1-16",
-            "count": ">= 16",
-        }),
-        #  ("https://www.instagram.com/instagram/", {
-        #  "options": (("highlights", True),),
-        #  "pattern": InstagramStoriesExtractor.pattern,
-        #  "range": "1-2",
-        #  "count": 2,
-        #  }),
+        ("https://www.instagram.com/instagram/"),
         ("https://www.instagram.com/instagram/?hl=en"),
     )
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
+    def items(self):
+        if self.config("highlights"):
+            self.log.warning("'highlights' is deprecated, "
+                             "use '\"include\": \"…,highlights\"' instead")
+            default = ("highlights", "posts")
+        else:
+            default = ("posts",)
+
+        base = "{}/{}/".format(self.root, self.item)
+        stories = "{}/stories/{}/".format(self.root, self.item)
+        return self._dispatch_extractors((
+            (InstagramStoriesExtractor   , stories),
+            (InstagramHighlightsExtractor, base + "highlights/"),
+            (InstagramPostsExtractor     , base + "posts/"),
+            (InstagramChannelExtractor   , base + "channel/"),
+        ), default)
+
+
+class InstagramPostsExtractor(InstagramExtractor):
+    """Extractor for ProfilePage posts"""
+    subcategory = "posts"
+    pattern = USER_PATTERN + r"/posts"
+    test = ("https://www.instagram.com/instagram/posts/", {
+        "range": "1-16",
+        "count": ">= 16",
+    })
 
     def posts(self):
-        url = "{}/{}/".format(self.root, self.user)
+        url = "{}/{}/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
-        if user.get("highlight_reel_count") and self.config("highlights"):
-            query_hash = "d4d88dc1500312af6f937f7b804c68c3"
-            variables = {
-                "user_id": user["id"],
-                "include_chaining": False,
-                "include_reel": True,
-                "include_suggested_users": False,
-                "include_logged_out_extras": False,
-                "include_highlight_reels": True,
-                "include_live_status": True,
-            }
-            data = self._graphql_request(query_hash, variables)
-            highlights = [
-                {
-                    "__typename": "GraphReel",
-                    "id"        : "highlight:" + edge["node"]["id"],
-                }
-                for edge in data["user"]["edge_highlight_reels"]["edges"]
-            ]
-        else:
-            highlights = None
-
         query_hash = "003056d32c2554def87228bc3fd9668a"
         variables = {"id": user["id"], "first": 50}
         edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
-        posts = self._pagination(query_hash, variables, edge)
-
-        return itertools.chain(highlights, posts) if highlights else posts
+        return self._pagination(query_hash, variables, edge)
 
 
 class InstagramChannelExtractor(InstagramExtractor):
     """Extractor for ProfilePage channel"""
     subcategory = "channel"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
-               r"([^/?#]+)/channel")
+    pattern = USER_PATTERN + r"/channel"
     test = ("https://www.instagram.com/instagram/channel/", {
         "range": "1-16",
         "count": ">= 16",
     })
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def posts(self):
-        url = "{}/{}/channel/".format(self.root, self.user)
+        url = "{}/{}/channel/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
         query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
@@ -436,17 +424,11 @@ def posts(self):
 class InstagramSavedExtractor(InstagramExtractor):
     """Extractor for ProfilePage saved media"""
     subcategory = "saved"
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
-               r"([^/?#]+)/saved")
+    pattern = USER_PATTERN + r"/saved"  # user name is already group 1
     test = ("https://www.instagram.com/instagram/saved/",)
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def posts(self):
-        url = "{}/{}/saved/".format(self.root, self.user)
+        url = "{}/{}/saved/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
         query_hash = "2ce1d673055b99250e93b6f88f878fde"
@@ -459,22 +441,17 @@ class InstagramTagExtractor(InstagramExtractor):
     """Extractor for TagPage"""
     subcategory = "tag"
     directory_fmt = ("{category}", "{subcategory}", "{tag}")
-    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/explore/tags/([^/?#]+)")
+    pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
     test = ("https://www.instagram.com/explore/tags/instagram/", {
         "range": "1-16",
         "count": ">= 16",
     })
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.tag = match.group(1)
-
     def metadata(self):
-        return {"tag": self.tag}
+        return {"tag": self.item}
 
     def posts(self):
-        url = "{}/explore/tags/{}/".format(self.root, self.tag)
+        url = "{}/explore/tags/{}/".format(self.root, self.item)
         data = self._extract_shared_data(url)
         hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
 
@@ -604,14 +581,10 @@ class InstagramPostExtractor(InstagramExtractor):
         ("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
     )
 
-    def __init__(self, match):
-        InstagramExtractor.__init__(self, match)
-        self.shortcode = match.group(1)
-
     def posts(self):
         query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
         variables = {
-            "shortcode"            : self.shortcode,
+            "shortcode"            : self.item,
             "child_comment_count"  : 3,
             "fetch_comment_count"  : 40,
             "parent_comment_count" : 24,
@@ -652,3 +625,34 @@ def posts(self):
             reel_id = user["id"]
 
         return ({"__typename": "GraphReel", "id": reel_id},)
+
+
+class InstagramHighlightsExtractor(InstagramExtractor):
+    """Extractor for all Instagram story highlights of a user"""
+    subcategory = "highlights"
+    pattern = USER_PATTERN + r"/highlights"
+    test = ("https://www.instagram.com/instagram/highlights",)
+
+    def posts(self):
+        url = "{}/{}/".format(self.root, self.item)
+        user = self._extract_profile_page(url)
+
+        query_hash = "d4d88dc1500312af6f937f7b804c68c3"
+        variables = {
+            "user_id": user["id"],
+            "include_chaining": False,
+            "include_reel": True,
+            "include_suggested_users": False,
+            "include_logged_out_extras": False,
+            "include_highlight_reels": True,
+            "include_live_status": True,
+        }
+        data = self._graphql_request(query_hash, variables)
+
+        return [
+            {
+                "__typename": "GraphReel",
+                "id"        : "highlight:" + edge["node"]["id"],
+            }
+            for edge in data["user"]["edge_highlight_reels"]["edges"]
+        ]
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -123,6 +123,7 @@
         "story": "",
     },
     "instagram": {
+        "posts": "",
         "saved": "Saved Posts",
     },
     "newgrounds": {