Skip to content

Commit

Permalink
[instagram] add 'include' option (closes #1180)
Browse files Browse the repository at this point in the history
Split the functionality of the old 'user' extractor into separate
'posts' and 'highlights' extractors, which respond to virtual URLs
('/<user>/posts' and '/<user>/highlights')
  • Loading branch information
mikf committed Dec 21, 2020
1 parent 7806165 commit bf629a2
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 83 deletions.
19 changes: 13 additions & 6 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1045,15 +1045,22 @@ Description
for details)


extractor.instagram.highlights
------------------------------
extractor.instagram.include
---------------------------
Type
``bool``
``string`` or ``list`` of ``strings``
Default
``false``
``"posts"``
Example
``"stories,highlights,posts"`` or ``["stories", "highlights", "posts"]``
Description
Include *Story Highlights* when downloading a user profile.
(requires authentication)
A (comma-separated) list of subcategories to include
when processing a user profile.

Possible values are
``"posts"``, ``"stories"``, ``"highlights"``, ``"channel"``.

You can use ``"all"`` instead of listing all values separately.


extractor.instagram.videos
Expand Down
2 changes: 1 addition & 1 deletion docs/supportedsites.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ Turboimagehost https://www.turboimagehost.com/ individual Images
.. |furaffinity-C| replace:: Favorites, Galleries, Posts, Scraps, Search Results, User Profiles
.. |hentaifoundry-C| replace:: Favorites, individual Images, Pictures, Popular Images, Recent Images, Scraps, Stories, User Profiles
.. |imgur-C| replace:: Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles
.. |instagram-C| replace:: Channels, Posts, Saved Posts, Stories, Tag Searches, User Profiles
.. |instagram-C| replace:: Channels, Highlights, Posts, Saved Posts, Stories, Tag Searches, User Profiles
.. |newgrounds-C| replace:: Art, Audio, Favorites, individual Images, Media Files, Movies, User Profiles
.. |nijie-C| replace:: Doujin, Favorites, Illustrations, individual Images, User Profiles
.. |pixiv-C| replace:: Favorites, Follows, pixiv.me Links, Rankings, Search Results, User Profiles, individual Images
Expand Down
156 changes: 80 additions & 76 deletions gallery_dl/extractor/instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
import json
import time
import re

BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"


class InstagramExtractor(Extractor):
"""Base class for instagram extractors"""
Expand All @@ -31,6 +33,7 @@ class InstagramExtractor(Extractor):

def __init__(self, match):
Extractor.__init__(self, match)
self.item = match.group(1)
self.www_claim = "0"
self.csrf_token = util.generate_csrf_token()
self._find_tags = re.compile(r"#\w+").findall
Expand Down Expand Up @@ -68,15 +71,18 @@ def posts(self):

def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)

if response.history and "/accounts/login/" in response.request.url:
if self._cursor:
self.log.info("Use '-o cursor=%s' to continue downloading "
"from the current position", self._cursor)
raise exception.StopExtraction(
"Redirected to login page (%s)", response.request.url)
"HTTP redirect to login page (%s)", response.request.url)

www_claim = response.headers.get("x-ig-set-www-claim")
if www_claim is not None:
self.www_claim = www_claim

return response

def _api_request(self, endpoint, params):
Expand Down Expand Up @@ -340,9 +346,9 @@ def _pagination(self, query_hash, variables, data):
if not info["has_next_page"]:
return
elif not data["edges"] and "_virtual" not in info:
s = "" if self.user.endswith("s") else "s"
s = "" if self.item.endswith("s") else "s"
raise exception.StopExtraction(
"%s'%s posts are private", self.user, s)
"%s'%s posts are private", self.item, s)

variables["after"] = self._cursor = info["end_cursor"]
self.log.debug("Cursor: %s", self._cursor)
Expand All @@ -351,80 +357,62 @@ def _pagination(self, query_hash, variables, data):


class InstagramUserExtractor(InstagramExtractor):
"""Extractor for ProfilePage"""
"""Extractor for an Instagram user profile"""
subcategory = "user"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
r"([^/?#]+)/?(?:$|[?#])")
pattern = USER_PATTERN + r"/?(?:$|[?#])"
test = (
("https://www.instagram.com/instagram/", {
"range": "1-16",
"count": ">= 16",
}),
# ("https://www.instagram.com/instagram/", {
# "options": (("highlights", True),),
# "pattern": InstagramStoriesExtractor.pattern,
# "range": "1-2",
# "count": 2,
# }),
("https://www.instagram.com/instagram/"),
("https://www.instagram.com/instagram/?hl=en"),
)

def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.user = match.group(1)
def items(self):
    """Hand a profile URL over to the per-section sub-extractors

    Builds the virtual URLs for stories, highlights, posts and channel
    and passes them, together with the default selection, to
    _dispatch_extractors().
    """
    # the legacy boolean 'highlights' option only widens the default set
    default = ("posts",)
    if self.config("highlights"):
        self.log.warning("'highlights' is deprecated, "
                         "use '\"include\": \"…,highlights\"' instead")
        default = ("highlights", "posts")

    profile = "{}/{}/".format(self.root, self.item)
    candidates = (
        (InstagramStoriesExtractor,
         "{}/stories/{}/".format(self.root, self.item)),
        (InstagramHighlightsExtractor, profile + "highlights/"),
        (InstagramPostsExtractor, profile + "posts/"),
        (InstagramChannelExtractor, profile + "channel/"),
    )
    return self._dispatch_extractors(candidates, default)


class InstagramPostsExtractor(InstagramExtractor):
"""Extractor for ProfilePage posts"""
subcategory = "posts"
pattern = USER_PATTERN + r"/posts"
test = ("https://www.instagram.com/instagram/posts/", {
"range": "1-16",
"count": ">= 16",
})

def posts(self):
url = "{}/{}/".format(self.root, self.user)
url = "{}/{}/".format(self.root, self.item)
user = self._extract_profile_page(url)

if user.get("highlight_reel_count") and self.config("highlights"):
query_hash = "d4d88dc1500312af6f937f7b804c68c3"
variables = {
"user_id": user["id"],
"include_chaining": False,
"include_reel": True,
"include_suggested_users": False,
"include_logged_out_extras": False,
"include_highlight_reels": True,
"include_live_status": True,
}
data = self._graphql_request(query_hash, variables)
highlights = [
{
"__typename": "GraphReel",
"id" : "highlight:" + edge["node"]["id"],
}
for edge in data["user"]["edge_highlight_reels"]["edges"]
]
else:
highlights = None

query_hash = "003056d32c2554def87228bc3fd9668a"
variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
posts = self._pagination(query_hash, variables, edge)

return itertools.chain(highlights, posts) if highlights else posts
return self._pagination(query_hash, variables, edge)


class InstagramChannelExtractor(InstagramExtractor):
"""Extractor for ProfilePage channel"""
subcategory = "channel"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
r"([^/?#]+)/channel")
pattern = USER_PATTERN + r"/channel"
test = ("https://www.instagram.com/instagram/channel/", {
"range": "1-16",
"count": ">= 16",
})

def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.user = match.group(1)

def posts(self):
url = "{}/{}/channel/".format(self.root, self.user)
url = "{}/{}/channel/".format(self.root, self.item)
user = self._extract_profile_page(url)

query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
Expand All @@ -436,17 +424,11 @@ def posts(self):
class InstagramSavedExtractor(InstagramExtractor):
"""Extractor for ProfilePage saved media"""
subcategory = "saved"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
r"([^/?#]+)/saved")
# USER_PATTERN already ends with the username capture group, so only the
# '/saved' suffix is appended here. A second '([^/?#]+)' would force the
# regex to backtrack and move the last character of the username out of
# group 1 (e.g. 'instagra' instead of 'instagram').
pattern = USER_PATTERN + r"/saved"
test = ("https://www.instagram.com/instagram/saved/",)

def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.user = match.group(1)

def posts(self):
url = "{}/{}/saved/".format(self.root, self.user)
url = "{}/{}/saved/".format(self.root, self.item)
user = self._extract_profile_page(url)

query_hash = "2ce1d673055b99250e93b6f88f878fde"
Expand All @@ -459,22 +441,17 @@ class InstagramTagExtractor(InstagramExtractor):
"""Extractor for TagPage"""
subcategory = "tag"
directory_fmt = ("{category}", "{subcategory}", "{tag}")
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/explore/tags/([^/?#]+)")
pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
test = ("https://www.instagram.com/explore/tags/instagram/", {
"range": "1-16",
"count": ">= 16",
})

def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.tag = match.group(1)

def metadata(self):
return {"tag": self.tag}
return {"tag": self.item}

def posts(self):
url = "{}/explore/tags/{}/".format(self.root, self.tag)
url = "{}/explore/tags/{}/".format(self.root, self.item)
data = self._extract_shared_data(url)
hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"]

Expand Down Expand Up @@ -604,14 +581,10 @@ class InstagramPostExtractor(InstagramExtractor):
("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
)

def __init__(self, match):
InstagramExtractor.__init__(self, match)
self.shortcode = match.group(1)

def posts(self):
query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
variables = {
"shortcode" : self.shortcode,
"shortcode" : self.item,
"child_comment_count" : 3,
"fetch_comment_count" : 40,
"parent_comment_count" : 24,
Expand Down Expand Up @@ -652,3 +625,34 @@ def posts(self):
reel_id = user["id"]

return ({"__typename": "GraphReel", "id": reel_id},)


class InstagramHighlightsExtractor(InstagramExtractor):
    """Extractor for every story highlight reel of an Instagram user"""
    subcategory = "highlights"
    pattern = USER_PATTERN + r"/highlights"
    test = ("https://www.instagram.com/instagram/highlights",)

    def posts(self):
        """Return one 'GraphReel' stub per highlight reel of the user"""
        profile_url = "{}/{}/".format(self.root, self.item)
        user = self._extract_profile_page(profile_url)

        # only the highlight reels (and reel/live status) are requested
        variables = {
            "user_id": user["id"],
            "include_chaining": False,
            "include_reel": True,
            "include_suggested_users": False,
            "include_logged_out_extras": False,
            "include_highlight_reels": True,
            "include_live_status": True,
        }
        data = self._graphql_request(
            "d4d88dc1500312af6f937f7b804c68c3", variables)

        reels = []
        for edge in data["user"]["edge_highlight_reels"]["edges"]:
            # reel IDs are given a 'highlight:' prefix
            reels.append({
                "__typename": "GraphReel",
                "id": "highlight:" + edge["node"]["id"],
            })
        return reels
1 change: 1 addition & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@
"story": "",
},
"instagram": {
"posts": "",
"saved": "Saved Posts",
},
"newgrounds": {
Expand Down

0 comments on commit bf629a2

Please sign in to comment.