From 7a74feda7837bc54b6b0ccfed980ca9bf7c8a906 Mon Sep 17 00:00:00 2001 From: xarantolus Date: Fri, 19 Jun 2020 14:57:57 +0200 Subject: [PATCH 01/49] [youtube] Fix extraction of search urls (closes ytdl-org/youtube-dl#25696) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bc79e01478..1f16012b20c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3146,7 +3146,40 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' + _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') + playlist_response = self._parse_json(playlist_json, None) + + result_items = try_get( + playlist_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 6dad89289cb2713065d8d28bd6adaf819188dc28 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 21:29:47 +0200 Subject: [PATCH 02/49] [youtube] Move search URL extraction to appropriate extractor --- youtube_dl/extractor/youtube.py | 67 ++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f16012b20c..bb20f74c7bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3146,40 +3146,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') - playlist_response = self._parse_json(playlist_json, None) - - result_items = try_get( - playlist_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) - - # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 
'searchPyvRenderer' (promoted video/ad) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): @@ -3243,6 +3210,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3254,6 +3222,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) + + result_items = try_get( + search_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) From 57f72370c510607273157d4ea319adacb6273c58 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Sun, 21 Jun 2020 09:31:04 +0200 Subject: [PATCH 03/49] [youtube] Fix feed extraction This moves feed extraction from using html content to json metadata. However, loading additional pages no longer works. The _extract_video_info function also returns a continuation object that contains some metadata that - together with an API key that is in the page source - might be used to request the next page. 
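A rough sketch of how that continuation data and the page-level config might be combined into such a follow-up request; the browse_ajax endpoint, the ctoken/itct parameter names and the X-YouTube-* header names are assumptions here, not something this patch implements:

    # Hypothetical helper for a feed extractor: request the next page of
    # results using the continuation metadata returned by _extract_video_info
    # together with config values found in the page source.  The endpoint,
    # query parameters and header names below are assumptions for
    # illustration only.
    def _download_next_feed_page(self, continuation, yt_conf):
        return self._download_json(
            'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
            'Downloading continuation page',
            query={
                'ctoken': continuation.get('continuation'),
                'continuation': continuation.get('continuation'),
                'itct': continuation.get('clickTrackingParams'),
            },
            headers={
                'X-YouTube-Client-Name': yt_conf.get('INNERTUBE_CONTEXT_CLIENT_NAME'),
                'X-YouTube-Client-Version': yt_conf.get('INNERTUBE_CONTEXT_CLIENT_VERSION'),
            })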
--- youtube_dl/extractor/youtube.py | 110 ++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bb20f74c7bc..29012bcbee8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3282,10 +3282,12 @@ def _real_extract(self, url): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties as well as an _extract_video_info function. """ _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + @property def IE_NAME(self): return 'youtube:%s' % self._FEED_NAME @@ -3296,34 +3298,41 @@ def _real_initialize(self): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + + video_info, continuation = self._extract_video_info(search_response) + + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + # TODO: Fix continuation request to download more pages def _real_extract(self, url): page = self._download_webpage( @@ -3372,6 +3381,32 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) + + for renderer in renderers: + vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) + if vid is not None: + videos.append(vid) + continue + + if 'richSectionRenderer' in renderer: + vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) + for v in vids: + vid = try_get(v, 
lambda x: x['richItemRenderer']['content']['videoRenderer']) + if vid is not None: + videos.append(vid) + continue + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' @@ -3379,6 +3414,23 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + for renderer in renderers: + for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): + vid = try_get(item, lambda x: x['gridVideoRenderer']) + if vid is not None: + videos.append(vid) + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' @@ -3386,6 +3438,22 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + for renderer in renderers: + vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) + if vid is not None: + videos.append(vid) + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' From b3fd4b155e7460ffd21e87eb29bc8a95902a429a Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Sun, 21 Jun 2020 09:41:42 +0200 Subject: [PATCH 04/49] run flake8 --- youtube_dl/extractor/youtube.py | 37 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 29012bcbee8..bd83584629c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3286,7 +3286,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' @property def IE_NAME(self): @@ -3299,20 +3299,20 @@ def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index info = [] - + for page_num in itertools.count(1): search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) video_info, continuation = self._extract_video_info(search_response) - - new_info = [] - + + new_info = [] + for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - - have_video = False + + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3386,15 +3386,15 @@ def _extract_video_info(self, initial_data): 
continuation_renderer = None renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) for renderer in renderers: vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) if vid is not None: videos.append(vid) - continue - + continue + if 'richSectionRenderer' in renderer: vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) for v in vids: @@ -3402,12 +3402,13 @@ def _extract_video_info(self, initial_data): if vid is not None: videos.append(vid) continue - + if 'continuationItemRenderer' in renderer: continuation_renderer = renderer return videos, continuation_renderer + class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' @@ -3419,8 +3420,8 @@ def _extract_video_info(self, initial_data): continuation_renderer = None renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) for renderer in renderers: for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): vid = try_get(item, lambda x: x['gridVideoRenderer']) @@ -3432,6 +3433,7 @@ def _extract_video_info(self, initial_data): return videos, continuation_renderer + class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' @@ -3441,10 +3443,10 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): def _extract_video_info(self, initial_data): videos = [] continuation_renderer = None - + renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) for renderer in renderers: vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) if vid is not None: @@ -3455,6 +3457,7 @@ def _extract_video_info(self, initial_data): return videos, continuation_renderer + class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list From 6a3cc8939415e246eacd5a6cc8007d6900f48079 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 08:56:21 +0200 Subject: [PATCH 05/49] [youtube] Make search extraction less dependent on json schema. If an object looks like a video (it has a `videoId` key), assume that it is. 
--- youtube_dl/extractor/youtube.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bd83584629c..69cc4a0170f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3222,16 +3222,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def _find_videos_in_json(self, extracted): + videos = [] + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - result_items = try_get( - search_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + result_items = self._find_videos_in_json(search_response) for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + video_id = try_get(plobj, lambda x: x['videoId']) + video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) if video_id is None or video_title is None: # we do not have a videoRenderer or it is empty From 5cbe7563bece11e52c833a79b0197ca4444ffe37 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 11:27:02 +0200 Subject: [PATCH 06/49] [youtube] Return to old feed extraction code as it *seems* like that change was reverted The old code now works again, but it downloads without limit. This is why a limit of 1000 videos is added, it can be overwritten with the `--max-downloads` option - that way, only so many ids will be extracted as videos downloaded --- youtube_dl/extractor/youtube.py | 115 ++++++++------------------------ 1 file changed, 28 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 69cc4a0170f..745e14fa3ce 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3303,7 +3303,7 @@ def _real_extract(self, url): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties as well as an _extract_video_info function. + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True @@ -3319,41 +3319,44 @@ def _real_initialize(self): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - info = [] + limit = self._downloader.params.get('max_downloads') or 1000 + ids = [] + more_widget_html = content_html = page for page_num in itertools.count(1): - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - video_info, continuation = self._extract_video_info(search_response) + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) + if not new_ids: + break - new_info = [] + done = False + if len(new_ids) + len(ids) > limit: + new_ids = new_ids[:limit - len(ids)] + done = True - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break + ids.extend(new_ids) - if not have_video: - new_info.append(v) + for entry in self._ids_to_results(new_ids): + yield entry - if not new_info: + if done: break - info.extend(new_info) - - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - - if not continuation: + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: break - # TODO: Fix continuation request to download more pages + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] def _real_extract(self, url): page = self._download_webpage( @@ -3402,33 +3405,6 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' - def _extract_video_info(self, initial_data): - videos = [] - continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) - - for renderer in renderers: - vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) - if vid is not None: - videos.append(vid) - continue - - if 'richSectionRenderer' in renderer: - vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) - for v in vids: - vid = try_get(v, lambda x: x['richItemRenderer']['content']['videoRenderer']) - if vid is not None: - videos.append(vid) - continue - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' @@ -3436,24 +3412,6 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' - def _extract_video_info(self, initial_data): - videos = [] - 
continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) - for renderer in renderers: - for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): - vid = try_get(item, lambda x: x['gridVideoRenderer']) - if vid is not None: - videos.append(vid) - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' @@ -3461,23 +3419,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' - def _extract_video_info(self, initial_data): - videos = [] - continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) - for renderer in renderers: - vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) - if vid is not None: - videos.append(vid) - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' From c37ca4732bf806113e2645efaebd037a6bcc0b5c Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 10 Jul 2020 11:47:13 +0200 Subject: [PATCH 07/49] [youtube] Remote download limit --- youtube_dl/extractor/youtube.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 745e14fa3ce..b53376d3106 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3319,8 +3319,6 @@ def _real_initialize(self): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - limit = self._downloader.params.get('max_downloads') or 1000 - ids = [] more_widget_html = content_html = page for page_num in itertools.count(1): @@ -3333,19 +3331,11 @@ def _entries(self, page): if not new_ids: break - done = False - if len(new_ids) + len(ids) > limit: - new_ids = new_ids[:limit - len(ids)] - done = True - ids.extend(new_ids) for entry in self._ids_to_results(new_ids): yield entry - if done: - break - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break From 7fa0a67cc1e5b5607fb6d30291a549e59e12c9b9 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 10 Jul 2020 11:50:50 +0200 Subject: [PATCH 08/49] Remove unused variable --- youtube_dl/extractor/youtube.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b53376d3106..ade6625f3f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3307,8 +3307,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - @property def IE_NAME(self): return 'youtube:%s' % self._FEED_NAME From 2bd94127a2319a88b5d98719f5e655682aed8b01 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Jun 2020 15:08:50 
+0100 Subject: [PATCH 09/49] [bellmedia] add support for cp24.com clip URLs(closes #25764) --- youtube_dl/extractor/bellmedia.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 485173774d9..9f9de96c613 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -25,8 +25,8 @@ class BellMediaIE(InfoExtractor): etalk| marilyn )\.ca| - much\.com - )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' + (?:much|cp24)\.com + )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', 'md5': '36d3ef559cfe8af8efe15922cd3ce950', @@ -62,6 +62,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.etalk.ca/video?videoid=663455', 'only_matching': True, + }, { + 'url': 'https://www.cp24.com/video?clipId=1982548', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', From 255f31b5cb42b5c13c1f775b0fa88737283d4526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Jun 2020 10:30:03 +0700 Subject: [PATCH 10/49] [youtube:playlists] Extend _VALID_URL (closes #25810) --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ade6625f3f8..974e0093447 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3116,7 +3116,7 @@ def _real_extract(self, url): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3142,6 +3142,9 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, }] From bb2c950b8eaac989032725923f8855be73d38596 Mon Sep 17 00:00:00 2001 From: Glenn Slayden <5589855+glenn-slayden@users.noreply.github.com> Date: Tue, 30 Jun 2020 12:56:16 -0700 Subject: [PATCH 11/49] [youtube] Prevent excess HTTP 301 (#25786) --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 974e0093447..dd6f38e6273 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -303,7 +303,7 @@ def _entries(self, page, playlist_id): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -2776,7 +2776,7 @@ def _extract_mix(self, playlist_id): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = 
orderedSet(re.findall( @@ -3342,7 +3342,7 @@ def _entries(self, page): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, headers=self._YOUTUBE_CLIENT_HEADERS) From 9fa728f4e89d0d6882a76cb27902029d60455993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 11 Jul 2020 18:27:19 +0700 Subject: [PATCH 12/49] [wistia] Restrict embed regex (closes #25969) --- youtube_dl/extractor/wistia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 168e5e90152..77febd2eb1b 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -56,7 +56,7 @@ def _extract_urls(webpage): urls.append(unescapeHTML(match.group('url'))) for match in re.finditer( r'''(?sx) - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2 + <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 ''', webpage): urls.append('wistia:%s' % match.group('id')) for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): From 54ffcbb8eb06eeeb6b295f08b653e5449c373d47 Mon Sep 17 00:00:00 2001 From: MRWITEK <mrvvitek@gmail.com> Date: Tue, 14 Jul 2020 14:01:15 +0300 Subject: [PATCH 13/49] [youtube] Improve description extraction (closes #25937) (#25980) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dd6f38e6273..368952a69d9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1930,7 +1930,7 @@ def replace_url(m): ''', replace_url, video_description) video_description = clean_html(video_description) else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') + video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage) if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): From 49004a6b59e3e09ce4533618e832cd94b242ba0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:04:50 +0700 Subject: [PATCH 14/49] [youtube] Fix sigfunc name extraction (closes #26134, closes #26135, closes #26136, closes #26137) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 368952a69d9..cf910ae5234 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1384,7 +1384,7 @@ def _parse_sig_js(self, jscode): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', From 
f4492c48904d441cbacdbc40bf978f674df3a3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:07:54 +0700 Subject: [PATCH 15/49] [ChangeLog] Actualize [ci skip] --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 07d6ccd69d6..a49904c89f3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Extractors +* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) +* [youtube] Improve description extraction (#25937, #25980) +* [wistia] Restrict embed regular expression (#25969) +* [youtube] Prevent excess HTTP 301 (#25786) ++ [youtube:playlists] Extend URL regular expression (#25810) ++ [bellmedia] Add support for cp24.com clip URLs (#25764) +* [brightcove] Improve embed detection (#25674) + + version 2020.06.16.1 Extractors From de722d3cd76c6fd4ba166c98cc681689534ee1a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:13:03 +0700 Subject: [PATCH 16/49] release 2020.07.28 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d29d5366fbb..f2260db465e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.06.16.1 + [debug] youtube-dl version 2020.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index ee882f98cf7..8bc05c4ba73 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 23033fe13d6..98348e0cd69 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. 
Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 5975313300c..86706f5289d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.06.16.1 + [debug] youtube-dl version 2020.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 5cfcb931862..52c2709f943 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index a49904c89f3..bf515f784b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.07.28 Extractors * [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6b88eb38cae..17101fa4750 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.06.16.1' +__version__ = '2020.07.28' From c449f709653bbd28293c8973f14a1c0a38600e58 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:34:48 +0200 Subject: [PATCH 17/49] [youtube] Fix feed extraction In order to extract videos from further pages, we need to get various variables that are in an argument to the `ytcfg.set` call in a script on the feed page. 
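For illustration, the same extraction can be exercised stand-alone with the regex this patch adds; the sample page snippet and its values below are made up:

    # Stand-alone sketch of the ytcfg.set extraction used by this patch.
    # The sample page content and the config values are fabricated.
    import json
    import re

    sample_page = 'ytcfg.set({"INNERTUBE_CONTEXT_CLIENT_NAME": 1, "ID_TOKEN": "abc"});'
    match = re.search(r"ytcfg.set\(({.*?})\)", sample_page)
    yt_conf = json.loads(match.group(1)) if match else {}
    print(yt_conf.get("INNERTUBE_CONTEXT_CLIENT_NAME"))  # -> 1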
--- youtube_dl/extractor/youtube.py | 96 ++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cf910ae5234..de70772c7e5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3309,6 +3309,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3317,37 +3319,91 @@ def IE_NAME(self): def _real_initialize(self): self._login() + + def _find_videos_in_json(self, extracted): + videos = [] + continuation = None + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + if "nextContinuationData" in obj: + nonlocal continuation + continuation = obj["nextContinuationData"] + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos, continuation + def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + video_info, continuation = self._find_videos_in_json(search_response) - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break + + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + search_response = self._download_json( + 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + query={ + "ctoken": try_get(continuation, lambda x: x["continuation"]), + "continuation": try_get(continuation, lambda x: x["continuation"]), + "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) + }, + headers={ + "X-YouTube-Client-Name": try_get(yt_conf, lambda x: 
x["INNERTUBE_CONTEXT_CLIENT_NAME"]), + "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), + "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), + "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), + "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), + "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + }) def _real_extract(self, url): page = self._download_webpage( From 4f37c60bf5f2af245985d314f0f64f473644feef Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:38:56 +0200 Subject: [PATCH 18/49] Run formatter --- youtube_dl/extractor/youtube.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index de70772c7e5..f6bed3f6837 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3310,7 +3310,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3319,10 +3319,9 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): videos = [] - continuation = None + continuation = None def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3336,19 +3335,19 @@ def _real_find(obj): if "videoId" in obj: videos.append(obj) return - + if "nextContinuationData" in obj: nonlocal continuation continuation = obj["nextContinuationData"] - return - + return + for _, o in obj.items(): _real_find(o) _real_find(extracted) return videos, continuation - + def _entries(self, page): info = [] @@ -3359,14 +3358,14 @@ def _entries(self, page): for page_num in itertools.count(1): video_info, continuation = self._find_videos_in_json(search_response) - new_info = [] + new_info = [] for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - have_video = False + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3402,7 +3401,7 @@ def _entries(self, page): "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), }) def _real_extract(self, url): From a5e386d9feb0e54013ec5aa1ba106869240fb995 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 31 Jul 2020 10:05:11 +0200 Subject: [PATCH 19/49] Fix python2 compatibility and title extraction --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f6bed3f6837..ad8db2c2d80 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3321,7 +3321,7 @@ def _real_initialize(self): def _find_videos_in_json(self, extracted): videos = [] - continuation = None + c = {} def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3337,8 +3337,7 @@ def _real_find(obj): return if "nextContinuationData" in obj: - 
nonlocal continuation - continuation = obj["nextContinuationData"] + c["continuation"] = obj["nextContinuationData"] return for _, o in obj.items(): @@ -3346,7 +3345,7 @@ def _real_find(obj): _real_find(extracted) - return videos, continuation + return videos, try_get(c, lambda x: x["continuation"]) def _entries(self, page): info = [] @@ -3380,7 +3379,7 @@ def _entries(self, page): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) if not continuation: break From 7d743516b541cf448bbaaa35ac95f8ecc8139432 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:29:16 +0200 Subject: [PATCH 20/49] [youtube] Make `ytcfg.set` config extraction non-fatal If the markup of the page changes in the future, it might be possible that _FEED_DATA still works, but the other regex does not. SInce it is not necessary for the first page of videos, we make sure the program doesn't exit before extracting them. TL;DR: Extract the first video page even if there are problems --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ad8db2c2d80..ee8a4626d98 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3350,7 +3350,7 @@ def _real_find(obj): def _entries(self, page): info = [] - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) @@ -3381,7 +3381,7 @@ def _entries(self, page): for video in new_info: yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) - if not continuation: + if not continuation or not yt_conf: break search_response = self._download_json( From 94255fa0b165d0646ae42e9b114f9dddaebc3123 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:30:08 +0200 Subject: [PATCH 21/49] [youtube] More general title extraction Seems like this attribute is moved every few weeks, so we just extract both and use the one that is present. 
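For reference, the two title layouts that have been observed look roughly like this (the values are illustrative):

    # Only one of these shapes is present at a time, so both accessors are
    # tried and the first non-empty result wins.
    title_as_runs = {'title': {'runs': [{'text': 'Some video title'}]}}
    title_as_simple_text = {'title': {'simpleText': 'Some video title'}}

    # Equivalent of the chained lookup without try_get:
    video = title_as_runs
    title = (video.get('title', {}).get('runs', [{}])[0].get('text')
             or video.get('title', {}).get('simpleText'))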
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ee8a4626d98..8f622662ab4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3379,7 +3379,7 @@ def _entries(self, page): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) if not continuation or not yt_conf: break From 4c47858c0584f5e38904871f8543f7271d703cc2 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 3 Sep 2020 20:41:45 +0200 Subject: [PATCH 22/49] Fix regex for other variable declaration type This now supports declarations like `window["ytInitialData"] = ...` and `var ytInitialData = ...` --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8f622662ab4..e62096bb27f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3309,7 +3309,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property From b948643f9c069da5bfbe89e2b311c91ca0313262 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 14:57:57 +0200 Subject: [PATCH 23/49] [youtube] Fix extraction of search urls (closes ytdl-org/youtube-dl#25696) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6ae2e58c176..eafd8b7af08 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3153,7 +3153,40 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' 
+ _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') + playlist_response = self._parse_json(playlist_json, None) + + result_items = try_get( + playlist_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 19f671f88b2f45c833a9fc7f6f2f7d9016eccc86 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 21:29:47 +0200 Subject: [PATCH 24/49] [youtube] Move search URL extraction to appropriate extractor --- youtube_dl/extractor/youtube.py | 67 ++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eafd8b7af08..22064616a88 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3153,40 +3153,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') - playlist_response = self._parse_json(playlist_json, None) - - result_items = try_get( - playlist_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) - - # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, 
titles_in_page) + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): @@ -3250,6 +3217,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3261,6 +3229,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) + + result_items = try_get( + search_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) From e03b4f3e056b80b99dd4ab4eed12c7089fb80a43 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 08:56:21 +0200 Subject: [PATCH 25/49] [youtube] Make search extraction less dependent on json schema. If an object looks like a video (it has a `videoId` key), assume that it is. 
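A rough standalone sketch of the schema-agnostic approach this commit describes: walk the whole ytInitialData tree and treat any dict carrying a `videoId` key as a video, instead of hard-coding the nesting path. The function and sample data below are illustrative, not the extractor's actual helpers:

```python
# Sketch of the "anything with a videoId is a video" traversal; assumes only
# that results are nested lists/dicts somewhere inside ytInitialData.
def find_videos(node, found=None):
    found = [] if found is None else found
    if isinstance(node, dict):
        if 'videoId' in node:
            found.append(node)   # looks like a videoRenderer, keep it
            return found         # no need to descend into its children
        for value in node.values():
            find_videos(value, found)
    elif isinstance(node, list):
        for item in node:
            find_videos(item, found)
    return found

data = {'contents': [{'videoRenderer': {'videoId': 'abc123', 'title': {}}},
                     {'shelfRenderer': {'items': []}}]}
print([v['videoId'] for v in find_videos(data)])  # -> ['abc123']
```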
--- youtube_dl/extractor/youtube.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 22064616a88..be04459627f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3229,16 +3229,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def _find_videos_in_json(self, extracted): + videos = [] + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - result_items = try_get( - search_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + result_items = self._find_videos_in_json(search_response) for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + video_id = try_get(plobj, lambda x: x['videoId']) + video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) if video_id is None or video_title is None: # we do not have a videoRenderer or it is empty From 5c430b67bd6befe4c5f257ba40b8d51979c1028c Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:34:48 +0200 Subject: [PATCH 26/49] [youtube] Fix feed extraction In order to extract videos from further pages, we need to get various variables that are in an argument to the `ytcfg.set` call in a script on the feed page. --- youtube_dl/extractor/youtube.py | 96 ++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index be04459627f..64c4ef32cf2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3313,6 +3313,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3321,37 +3323,91 @@ def IE_NAME(self): def _real_initialize(self): self._login() + + def _find_videos_in_json(self, extracted): + videos = [] + continuation = None + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + if "nextContinuationData" in obj: + nonlocal continuation + continuation = obj["nextContinuationData"] + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos, continuation + def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + video_info, continuation = self._find_videos_in_json(search_response) - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break + + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + search_response = self._download_json( + 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + query={ + "ctoken": try_get(continuation, lambda x: x["continuation"]), + "continuation": try_get(continuation, lambda x: x["continuation"]), + "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) + }, + headers={ + "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), + "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), + "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), + "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), + "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), + "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + }) 
def _real_extract(self, url): page = self._download_webpage( From f536080701c29829d6eebefeb4915307ee44e7d8 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:38:56 +0200 Subject: [PATCH 27/49] Run formatter --- youtube_dl/extractor/youtube.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 64c4ef32cf2..d97e0ab4e5b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3314,7 +3314,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3323,10 +3323,9 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): videos = [] - continuation = None + continuation = None def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3340,19 +3339,19 @@ def _real_find(obj): if "videoId" in obj: videos.append(obj) return - + if "nextContinuationData" in obj: nonlocal continuation continuation = obj["nextContinuationData"] - return - + return + for _, o in obj.items(): _real_find(o) _real_find(extracted) return videos, continuation - + def _entries(self, page): info = [] @@ -3363,14 +3362,14 @@ def _entries(self, page): for page_num in itertools.count(1): video_info, continuation = self._find_videos_in_json(search_response) - new_info = [] + new_info = [] for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - have_video = False + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3406,7 +3405,7 @@ def _entries(self, page): "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), }) def _real_extract(self, url): From 299056ad52222911eea22db0b1a0715bef7572ef Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 31 Jul 2020 10:05:11 +0200 Subject: [PATCH 28/49] Fix python2 compatibility and title extraction --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d97e0ab4e5b..ec631cd2292 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3325,7 +3325,7 @@ def _real_initialize(self): def _find_videos_in_json(self, extracted): videos = [] - continuation = None + c = {} def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3341,8 +3341,7 @@ def _real_find(obj): return if "nextContinuationData" in obj: - nonlocal continuation - continuation = obj["nextContinuationData"] + c["continuation"] = obj["nextContinuationData"] return for _, o in obj.items(): @@ -3350,7 +3349,7 @@ def _real_find(obj): _real_find(extracted) - return videos, continuation + return videos, try_get(c, lambda x: x["continuation"]) def _entries(self, page): info = [] @@ -3384,7 +3383,7 @@ def _entries(self, page): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), 
YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) if not continuation: break From 1f93faf60bb1447ff1aa661e46916e863640ade2 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:29:16 +0200 Subject: [PATCH 29/49] [youtube] Make `ytcfg.set` config extraction non-fatal If the markup of the page changes in the future, it might be possible that _FEED_DATA still works, but the other regex does not. SInce it is not necessary for the first page of videos, we make sure the program doesn't exit before extracting them. TL;DR: Extract the first video page even if there are problems --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec631cd2292..ec821cbc042 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3354,7 +3354,7 @@ def _real_find(obj): def _entries(self, page): info = [] - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) @@ -3385,7 +3385,7 @@ def _entries(self, page): for video in new_info: yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) - if not continuation: + if not continuation or not yt_conf: break search_response = self._download_json( From f442082a50f94fc3c36db954764b70d6a08beaa1 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:30:08 +0200 Subject: [PATCH 30/49] [youtube] More general title extraction Seems like this attribute is moved every few weeks, so we just extract both and use the one that is present. 
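For the non-fatal `ytcfg.set` change earlier in this pair of commits ([PATCH 29/49] above), the pattern amounts to giving the regex search a default and letting the JSON parse fail quietly, so a broken config regex only disables pagination rather than aborting extraction. A simplified standalone illustration, using plain `re`/`json` instead of the extractor helpers:

```python
# If the ytcfg.set regex stops matching, return None and skip pagination;
# the first page of results can still be extracted.
import json
import re

YTCFG_DATA = r"ytcfg\.set\(({.*?})\)"

def extract_ytcfg(page):
    match = re.search(YTCFG_DATA, page)
    if not match:
        return None              # markup changed: degrade instead of aborting
    try:
        return json.loads(match.group(1))
    except ValueError:
        return None

page_without_cfg = '<script>window["ytInitialData"] = {};</script>'
print(extract_ytcfg(page_without_cfg))  # -> None, extraction continues anyway
```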
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec821cbc042..c8d80bbd2b0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3383,7 +3383,7 @@ def _entries(self, page): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) if not continuation or not yt_conf: break From bea9b00588a2d5376c8edeaa968d4c484db415c8 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 3 Sep 2020 20:41:45 +0200 Subject: [PATCH 31/49] Fix regex for other variable declaration type This now supports declarations like `window["ytInitialData"] = ...` and `var ytInitialData = ...` --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c8d80bbd2b0..c03ca5b31c7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3313,7 +3313,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property From b84071c0a914d271b1bee475628150990daae905 Mon Sep 17 00:00:00 2001 From: Joel Potts <jpotts@redpoints.com> Date: Tue, 15 Sep 2020 17:16:58 +0200 Subject: [PATCH 32/49] [youtube] Added 'subscriber_count' to extraction --- youtube_dl/extractor/youtube.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02f3ab61aef..f0d2a8873c2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -39,6 +39,7 @@ mimetype2ext, orderedSet, parse_codecs, + parse_count, parse_duration, remove_quotes, remove_start, @@ -2421,6 +2422,14 @@ def _extract_count(count_name): video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) + # Get Subscriber Count of channel + subscriber_count = parse_count(self._search_regex( + r'"text":"([\d\.]+\w?) 
subscribers"', + video_webpage, + 'subscriber count', + default=None + )) + # annotations video_annotations = None if self._downloader.params.get('writeannotations', False): @@ -2558,6 +2567,7 @@ def decrypt_sig(mobj): 'album': album, 'release_date': release_date, 'release_year': release_year, + 'subscriber_count': subscriber_count, } From c0a1a8926d91b7d1656240bbfc880b160811a3b9 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 22 Sep 2020 20:52:52 +0200 Subject: [PATCH 33/49] Use better regex for all fixed extraction types --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c03ca5b31c7..1f9cc73717f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3217,7 +3217,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, From a6c666d06cbcdc7afc15726abae79235fb22546e Mon Sep 17 00:00:00 2001 From: Joel Potts <jpotts@redpoints.com> Date: Tue, 15 Sep 2020 17:33:44 +0200 Subject: [PATCH 34/49] [youtube] Updated extraction of 'like_count' value --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02f3ab61aef..b5e274d00ec 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2393,7 +2393,7 @@ def extract_meta(field): def _extract_count(count_name): return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}' % re.escape(count_name), video_webpage, count_name, default=None)) From 9c1f99402fa25a5a691944c133432741af19829b Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Wed, 23 Sep 2020 23:09:00 +0200 Subject: [PATCH 35/49] [bandcamp] fix regexp for JSON matching on bandcamp --- youtube_dl/extractor/bandcamp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc82..ad181232066 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -91,10 +91,11 @@ def _real_extract(self, url): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + trackinfo_block = self._search_regex( + r'trackinfo":\[\s*({.+?})\s*\],"', + webpage, 'track info', default='{}') + quoted_json = trackinfo_block.replace('"', '"') + track_info = self._parse_json(quoted_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -117,7 +118,7 @@ def _real_extract(self, url): def extract(key): return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, + r',"%s":(")(?P<value>(?:(?!").)+)"' % key, webpage, key, default=None, group='value') artist = extract('artist') From 
14194392a813a12b3a1477ec75bcd0c8626ef3bb Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Sat, 26 Sep 2020 17:34:35 +0200 Subject: [PATCH 36/49] [bandcamp] use unescapeHTML instead of a simple replace of quotes --- youtube_dl/extractor/bandcamp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index ad181232066..55d110e2806 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -92,10 +92,10 @@ def _real_extract(self, url): formats = [] trackinfo_block = self._search_regex( - r'trackinfo":\[\s*({.+?})\s*\],"', + r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - quoted_json = trackinfo_block.replace('"', '"') - track_info = self._parse_json(quoted_json, title) + unescaped_json = unescapeHTML(trackinfo_block) + track_info = self._parse_json(unescaped_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -118,7 +118,7 @@ def _real_extract(self, url): def extract(key): return self._search_regex( - r',"%s":(")(?P<value>(?:(?!").)+)"' % key, + r',(["\']|")%s\1:\1(?P<value>(?:(?!\1).)+)\1' % key, webpage, key, default=None, group='value') artist = extract('artist') From f43a856334b633e3d2f778b455fb08a4a06fbf51 Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Sun, 27 Sep 2020 14:51:42 +0200 Subject: [PATCH 37/49] [bandcamp] match album titles inside the new JSON data block, and unescape the title properly --- youtube_dl/extractor/bandcamp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 55d110e2806..f036a89ebd9 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -316,10 +316,10 @@ def _real_extract(self, url): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', webpage, 'title', fatal=False) if title: - title = title.replace(r'\"', '"') + title = unescapeHTML(title) return { '_type': 'playlist', 'uploader_id': uploader_id, From 9385ec4b1c797ffab66b945f23fd4248c0c8a32e Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Sun, 27 Sep 2020 15:11:08 +0200 Subject: [PATCH 38/49] [bandcamp] fix the freeDownloadPage JSON lookup, and use the id from the URL to match the tracks --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f036a89ebd9..eccb867a0db 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -128,12 +128,12 @@ def extract(key): release_date = unified_strdate(extract('album_release_date')) download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage, 'download link', default=None, group='url') if download_link: track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + r'\?id=(?P<id>\d+)&', + download_link, 'track id') download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') From 37f625598cb9b02cb06b3f12033cc29699d70818 Mon Sep 17 00:00:00 2001 From: 
Gilles Pietri <gilles@wolface.fr> Date: Sun, 27 Sep 2020 15:52:55 +0200 Subject: [PATCH 39/49] [bandcamp] update youtuble dl test song information to match title as artist - track, and add missing keys from info_dict --- youtube_dl/extractor/bandcamp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index eccb867a0db..3d32b1e0f29 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,8 +33,11 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, + 'uploader': 'youtube-dl \\', + 'timestamp': 1354224127, + 'upload_date': '20121129', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { From 75a83afe3b8fd9dfe242ca2de428c313a2bd3e0e Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Mon, 28 Sep 2020 19:42:56 +0200 Subject: [PATCH 40/49] [bandcamp] fix test song uploader name, cleanup remanings " and \ in data, including album titles --- youtube_dl/extractor/bandcamp.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3d32b1e0f29..3405b570afb 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,9 +33,9 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, - 'uploader': 'youtube-dl \\', + 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, 'upload_date': '20121129', }, @@ -43,7 +43,7 @@ class BandcampIE(InfoExtractor): }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', + 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -94,11 +94,12 @@ def _real_extract(self, url): duration = None formats = [] - trackinfo_block = self._search_regex( + trackinfo_block = self._html_search_regex( r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - unescaped_json = unescapeHTML(trackinfo_block) - track_info = self._parse_json(unescaped_json, title) + + track_info = self._parse_json(trackinfo_block, title) + if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -120,9 +121,10 @@ def _real_extract(self, url): duration = float_or_none(track_info.get('duration')) def extract(key): - return self._search_regex( - r',(["\']|")%s\1:\1(?P<value>(?:(?!\1).)+)\1' % key, + data = self._html_search_regex( + r',(["\']|")%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key, webpage, key, default=None, group='value') + return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data artist = extract('artist') album = extract('album_title') @@ -319,10 +321,12 @@ def _real_extract(self, url): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', - webpage, 'title', fatal=False) + 
r'album_title\s*(?:"|["\']):\s*("|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1', + webpage, 'title', fatal=False, group='album') + if title: - title = unescapeHTML(title) + title = title.replace(r'\"', '"') + return { '_type': 'playlist', 'uploader_id': uploader_id, From 03edd545a9e14b0fbcb36574248d8cf0e7a224d6 Mon Sep 17 00:00:00 2001 From: Gilles Pietri <gilles@wolface.fr> Date: Tue, 29 Sep 2020 12:09:55 +0200 Subject: [PATCH 41/49] [bandcamp] Revert test song title, and extract title generally (which may fail, as the other title json values might come up), instead of out of trackinfo, as bandcamp prefixes it with artist - --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3405b570afb..04b8aa80f91 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,7 +33,7 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, @@ -99,7 +99,6 @@ def _real_extract(self, url): webpage, 'track info', default='{}') track_info = self._parse_json(trackinfo_block, title) - if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -115,7 +114,7 @@ def _real_extract(self, url): 'acodec': ext, 'abr': int_or_none(abr_str), }) - track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) @@ -126,6 +125,7 @@ def extract(key): webpage, key, default=None, group='value') return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + track = extract('title') artist = extract('artist') album = extract('album_title') timestamp = unified_timestamp( From 955c4cb6ac87d997e090cb809c21bba8cc6e3e0a Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Wed, 30 Sep 2020 15:49:51 +0200 Subject: [PATCH 42/49] [youtube/search_url]: improve title extraction --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f9cc73717f..6207585cf5d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3257,12 +3257,12 @@ def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): result_items = self._find_videos_in_json(search_response) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoId']) - video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) + for renderer in result_items: + video_id = try_get(renderer, lambda x: x['videoId']) + video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText']) if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty + # we do not have a videoRenderer or title extraction broke continue video_title = video_title.strip() From 6e728bc988cff77b451c7cf1ded171d5086476ea Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Wed, 7 Oct 2020 04:17:40 +0200 Subject: [PATCH 43/49] [skip travis] ignore cookies 
(gitignore) --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 9d371d9978f..065a14f49be 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,5 @@ venv/ # VS Code related files .vscode + +cookies.txt From 9d9314cb66cc2c815844fa8778360e1b0098b0f7 Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Wed, 7 Oct 2020 04:19:08 +0200 Subject: [PATCH 44/49] [youtube] only playable on yt and age gated --- youtube_dlc/extractor/youtube.py | 69 +++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d781c35b5f6..8946b7df866 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1861,31 +1861,60 @@ def extract_player_response(player_response, video_id): embed_webpage = None if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+' or re.search(r'player-age-gate-content">', video_webpage) is not None): + cookie_keys = self._get_cookies('https://www.youtube.com').keys() age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + # check if video is only playable on youtube - if so it requires auth (cookies) + if re.search(r'player-unavailable">', embed_webpage) is not None: + if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys + or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys): + age_gate = False + # Try looking directly into the video webpage + ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) + if ytplayer_config: + args = ytplayer_config['args'] + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. 
+ # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) + else: + raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True) + else: + data = compat_urllib_parse_urlencode({ + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'sts': self._search_regex( + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), + }) + video_info_url = proto + '://www.youtube.com/get_video_info?' + data + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False # Try looking directly into the video webpage From 4bb9c8802e85211b67250fca726f3403ffc9be5e Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Wed, 7 Oct 2020 04:31:23 +0200 Subject: [PATCH 45/49] flake8 --- youtube_dlc/extractor/youtube.py | 44 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 8946b7df866..8c7e57b29c1 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1870,28 +1870,28 @@ def extract_player_response(player_response, video_id): # check if video is only playable on youtube - if so it requires auth (cookies) if re.search(r'player-unavailable">', embed_webpage) is not None: if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys - or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys): - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. 
- # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) + or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys): + age_gate = False + # Try looking directly into the video webpage + ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) + if ytplayer_config: + args = ytplayer_config['args'] + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) else: raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True) else: From c73baf23e0e9e9b8197523b70859a57f12aab6ad Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Wed, 7 Oct 2020 04:54:38 +0200 Subject: [PATCH 46/49] fix to support python 2.6 --- youtube_dlc/extractor/youtube.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 8c7e57b29c1..293d6069d09 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1869,8 +1869,13 @@ def extract_player_response(player_response, video_id): embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') # check if video is only playable on youtube - if so it requires auth (cookies) if re.search(r'player-unavailable">', embed_webpage) is not None: + ''' + # TODO apply this patch when Support for Python 2.6(!) 
and above drops if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys): + ''' + if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys) + or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)): age_gate = False # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) From b777004649bcf2c5eb86c12a525ccb327ab55126 Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Wed, 7 Oct 2020 05:34:22 +0200 Subject: [PATCH 47/49] Merge branch 'ytdl-org-master' --- README.md | 302 +++++++++++++++++++++++++++++ youtube_dlc/extractor/expressen.py | 7 +- youtube_dlc/extractor/iprima.py | 3 +- 3 files changed, 309 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2a0cf3a48c3..2d8bd9b8524 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,14 @@ youtube-dlc is a fork of youtube-dl with the intention of getting features teste - [Adobe Pass Options:](#adobe-pass-options) - [Post-processing Options:](#post-processing-options) - [Extractor Options:](#extractor-options) +- [CONFIGURATION](#configuration) + - [Authentication with `.netrc` file](#authentication-with-netrc-file) +- [OUTPUT TEMPLATE](#output-template) + - [Output template and Windows batch files](#output-template-and-windows-batch-files) + - [Output template examples](#output-template-examples) +- [FORMAT SELECTION](#format-selection) + - [Format selection examples](#format-selection-examples) +- [VIDEO SELECTION](#video-selection-1) # INSTALLATION @@ -474,3 +482,297 @@ Then simply type this ## Extractor Options: --ignore-dynamic-mpd Do not process dynamic DASH manifests +# CONFIGURATION + +You can configure youtube-dlc by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dlc.conf` and the user wide configuration file at `~/.config/youtube-dlc/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dlc\config.txt` or `C:\Users\<user name>\youtube-dlc.conf`. Note that by default configuration file may not exist so you may need to create it yourself. + +For example, with the following configuration file youtube-dlc will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: +``` +# Lines starting with # are comments + +# Always extract audio +-x + +# Do not copy the mtime +--no-mtime + +# Use this proxy +--proxy 127.0.0.1:3128 + +# Save all videos under Movies directory in your home directory +-o ~/Movies/%(title)s.%(ext)s +``` + +Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. + +You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dlc run. + +You can also use `--config-location` if you want to use custom configuration file for a particular youtube-dlc run. 
+ +### Authentication with `.netrc` file + +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dlc execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: +``` +touch $HOME/.netrc +chmod a-rwx,u+rw $HOME/.netrc +``` +After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: +``` +machine <extractor> login <login> password <password> +``` +For example: +``` +machine youtube login myaccount@gmail.com password my_youtube_password +machine twitch login my_twitch_account_name password my_twitch_password +``` +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dlc or place it in the [configuration file](#configuration). + +On Windows you may also need to setup the `%HOME%` environment variable manually. For example: +``` +set HOME=%USERPROFILE% +``` + +# OUTPUT TEMPLATE + +The `-o` option allows users to indicate a template for the output file names. + +**tl;dr:** [navigate me to examples](#output-template-examples). + +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dlc -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. 
Allowed names along with sequence type are: + + - `id` (string): Video identifier + - `title` (string): Video title + - `url` (string): Video URL + - `ext` (string): Video filename extension + - `alt_title` (string): A secondary title of the video + - `display_id` (string): An alternative identifier for the video + - `uploader` (string): Full name of the video uploader + - `license` (string): License name the video is licensed under + - `creator` (string): The creator of the video + - `release_date` (string): The date (YYYYMMDD) when the video was released + - `timestamp` (numeric): UNIX timestamp of the moment the video became available + - `upload_date` (string): Video upload date (YYYYMMDD) + - `uploader_id` (string): Nickname or id of the video uploader + - `channel` (string): Full name of the channel the video is uploaded on + - `channel_id` (string): Id of the channel + - `location` (string): Physical location where the video was filmed + - `duration` (numeric): Length of the video in seconds + - `view_count` (numeric): How many users have watched the video on the platform + - `like_count` (numeric): Number of positive ratings of the video + - `dislike_count` (numeric): Number of negative ratings of the video + - `repost_count` (numeric): Number of reposts of the video + - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage + - `comment_count` (numeric): Number of comments on the video + - `age_limit` (numeric): Age restriction for the video (years) + - `is_live` (boolean): Whether this video is a live stream or a fixed-length video + - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL + - `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL + - `format` (string): A human-readable description of the format + - `format_id` (string): Format code specified by `--format` + - `format_note` (string): Additional info about the format + - `width` (numeric): Width of the video + - `height` (numeric): Height of the video + - `resolution` (string): Textual description of width and height + - `tbr` (numeric): Average bitrate of audio and video in KBit/s + - `abr` (numeric): Average audio bitrate in KBit/s + - `acodec` (string): Name of the audio codec in use + - `asr` (numeric): Audio sampling rate in Hertz + - `vbr` (numeric): Average video bitrate in KBit/s + - `fps` (numeric): Frame rate + - `vcodec` (string): Name of the video codec in use + - `container` (string): Name of the container format + - `filesize` (numeric): The number of bytes, if known in advance + - `filesize_approx` (numeric): An estimate for the number of bytes + - `protocol` (string): The protocol that will be used for the actual download + - `extractor` (string): Name of the extractor + - `extractor_key` (string): Key name of the extractor + - `epoch` (numeric): Unix epoch when creating the file + - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start` + - `playlist` (string): Name or id of the playlist that contains the video + - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist + - `playlist_id` (string): Playlist identifier + - `playlist_title` (string): Playlist title + - `playlist_uploader` (string): Full name of the playlist uploader + - `playlist_uploader_id` (string): Nickname or id of the playlist uploader + +Available for the video that 
belongs to some logical chapter or section: + + - `chapter` (string): Name or title of the chapter the video belongs to + - `chapter_number` (numeric): Number of the chapter the video belongs to + - `chapter_id` (string): Id of the chapter the video belongs to + +Available for the video that is an episode of some series or programme: + + - `series` (string): Title of the series or programme the video episode belongs to + - `season` (string): Title of the season the video episode belongs to + - `season_number` (numeric): Number of the season the video episode belongs to + - `season_id` (string): Id of the season the video episode belongs to + - `episode` (string): Title of the video episode + - `episode_number` (numeric): Number of the video episode within a season + - `episode_id` (string): Id of the video episode + +Available for the media that is a track or a part of a music album: + + - `track` (string): Title of the track + - `track_number` (numeric): Number of the track within an album or a disc + - `track_id` (string): Id of the track + - `artist` (string): Artist(s) of the track + - `genre` (string): Genre(s) of the track + - `album` (string): Title of the album the track belongs to + - `album_type` (string): Type of the album + - `album_artist` (string): List of all artists appeared on the album + - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to + - `release_year` (numeric): Year (YYYY) when the album was released + +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. + +For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dlc test video` and id `BaW_jenozKcj`, this will result in a `youtube-dlc test video-BaW_jenozKcj.mp4` file created in the current directory. + +For numeric sequences you can use numeric related formatting, for example, `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. + +Output templates can also contain arbitrary hierarchical path, e.g. `-o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s'` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. + +To use percent literals in an output template use `%%`. To output to stdout use `-o -`. + +The current default template is `%(title)s-%(id)s.%(ext)s`. + +In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: + +#### Output template and Windows batch files + +If you are using an output template inside a Windows batch file then you must escape plain percent characters (`%`) by doubling, so that `-o "%(title)s-%(id)s.%(ext)s"` should become `-o "%%(title)s-%%(id)s.%%(ext)s"`. However you should not touch `%`'s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`. + +#### Output template examples + +Note that on Windows you may need to use double quotes instead of single. 
+ +```bash +$ youtube-dlc --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc +youtube-dlc test video ''_ä↭𝕐.mp4 # All kinds of weird characters + +$ youtube-dlc --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc --restrict-filenames +youtube-dlc_test_video_.mp4 # A simple file name + +# Download YouTube playlist videos in separate directory indexed by video order in a playlist +$ youtube-dlc -o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re + +# Download all playlists of YouTube channel/user keeping each playlist in separate directory: +$ youtube-dlc -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/user/TheLinuxFoundation/playlists + +# Download Udemy course keeping each chapter in separate directory under MyVideos directory in your home +$ youtube-dlc -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ + +# Download entire series season keeping each series and each season in separate directory under C:/MyVideos +$ youtube-dlc -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 + +# Stream the video being downloaded to stdout +$ youtube-dlc -o - BaW_jenozKc +``` + +# FORMAT SELECTION + +By default youtube-dlc tries to download the best available quality, i.e. if you want the best quality you **don't need** to pass any special options, youtube-dlc will guess it for you by **default**. + +But sometimes you may want to download in a different format, for example when you are on a slow or intermittent connection. The key mechanism for achieving this is so-called *format selection* based on which you can explicitly specify desired format, select formats based on some criterion or criteria, setup precedence and much more. + +The general syntax for format selection is `--format FORMAT` or shorter `-f FORMAT` where `FORMAT` is a *selector expression*, i.e. an expression that describes format or formats you would like to download. + +**tl;dr:** [navigate me to examples](#format-selection-examples). + +The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. + +You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file. + +You can also use special names to select particular edge case formats: + + - `best`: Select the best quality format represented by a single file with video and audio. + - `worst`: Select the worst quality format represented by a single file with video and audio. + - `bestvideo`: Select the best quality video-only format (e.g. DASH video). May not be available. + - `worstvideo`: Select the worst quality video-only format. May not be available. + - `bestaudio`: Select the best quality audio only-format. May not be available. + - `worstaudio`: Select the worst quality audio only-format. May not be available. + +For example, to download the worst quality video-only format you can use `-f worstvideo`. 
+
+If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes. Note that the slash operator is left-associative, i.e. formats on the left-hand side are preferred; for example, `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download.
+
+If you want to download several formats of the same video use a comma as a separator, e.g. `-f 22,17,18` will download all three of these formats, provided they are available. Or a more sophisticated example combined with the precedence feature: `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`.
+
+You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`).
+
+The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals):
+
+ - `filesize`: The number of bytes, if known in advance
+ - `width`: Width of the video, if known
+ - `height`: Height of the video, if known
+ - `tbr`: Average bitrate of audio and video in KBit/s
+ - `abr`: Average audio bitrate in KBit/s
+ - `vbr`: Average video bitrate in KBit/s
+ - `asr`: Audio sampling rate in Hertz
+ - `fps`: Frame rate
+
+Filtering also works for the comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and the following string meta fields:
+
+ - `ext`: File extension
+ - `acodec`: Name of the audio codec in use
+ - `vcodec`: Name of the video codec in use
+ - `container`: Name of the container format
+ - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`)
+ - `format_id`: A short description of the format
+
+Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain).
+
+Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by a particular extractor, i.e. the metadata offered by the video hoster.
+
+Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s.
+
+You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv to be installed), for example `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg/avconv.
+
+Format selectors can also be grouped using parentheses; for example, if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`.
+
+Since the end of April 2015 and version 2015.04.26, youtube-dlc uses `-f bestvideo+bestaudio/best` as the default format selection (see [#5447](https://github.com/ytdl-org/youtube-dl/issues/5447), [#5456](https://github.com/ytdl-org/youtube-dl/issues/5456)). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available.
Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to download only some DASH formats (for example, if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dlc to stream to `stdout` (and most likely pipe it to your media player), i.e. you explicitly specify the output template as `-o -`, youtube-dlc still uses `-f best` format selection in order to start content delivery to your player immediately rather than waiting until `bestvideo` and `bestaudio` are downloaded and muxed.
+
+If you want to preserve the old format selection behavior (prior to youtube-dlc 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dlc.
+
+#### Format selection examples
+
+Note that on Windows you may need to use double quotes instead of single.
+
+```bash
+# Download the best mp4 format available, or any other best format if no mp4 is available
+$ youtube-dlc -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'
+
+# Download the best format available but no better than 480p
+$ youtube-dlc -f 'bestvideo[height<=480]+bestaudio/best[height<=480]'
+
+# Download the best format available but no bigger than 50 MB
+$ youtube-dlc -f 'best[filesize<50M]'
+
+# Download the best format available via direct link over HTTP/HTTPS protocol
+$ youtube-dlc -f '(bestvideo+bestaudio/best)[protocol^=http]'
+
+# Download the best video format and the best audio format without merging them
+$ youtube-dlc -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s'
+```
+Note that in the last example, an output template is recommended as `bestvideo` and `bestaudio` may have the same file name.
+
+
+# VIDEO SELECTION
+
+Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`. They accept dates in two formats:
+
+ - Absolute dates: Dates in the format `YYYYMMDD`.
+ - Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?`
+
+Examples:
+
+```bash
+# Download only the videos uploaded in the last 6 months
+$ youtube-dlc --dateafter now-6months
+
+# Download only the videos uploaded on January 1, 1970
+$ youtube-dlc --date 19700101
+
+# Download only the videos uploaded in the 200x decade
+$ youtube-dlc --dateafter 20000101 --datebefore 20091231
+```
\ No newline at end of file
diff --git a/youtube_dlc/extractor/expressen.py b/youtube_dlc/extractor/expressen.py
index f79365038d9..dc8b855d233 100644
--- a/youtube_dlc/extractor/expressen.py
+++ b/youtube_dlc/extractor/expressen.py
@@ -15,7 +15,7 @@ class ExpressenIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:www\.)?expressen\.se/
+                        (?:www\.)?(?:expressen|di)\.se/
                         (?:(?:tvspelare/video|videoplayer/embed)/)?
tv/(?:[^/]+/)* (?P<id>[^/?#&]+) @@ -42,13 +42,16 @@ class ExpressenIE(InfoExtractor): }, { 'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', 'only_matching': True, + }, { + 'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return [ mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', webpage)] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/iprima.py b/youtube_dlc/extractor/iprima.py index 53a550c11e4..648ae6741f1 100644 --- a/youtube_dlc/extractor/iprima.py +++ b/youtube_dlc/extractor/iprima.py @@ -86,7 +86,8 @@ def _real_extract(self, url): (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', r'data-product="([^"]+)">', r'id=["\']player-(p\d+)"', - r'playerId\s*:\s*["\']player-(p\d+)'), + r'playerId\s*:\s*["\']player-(p\d+)', + r'\bvideos\s*=\s*["\'](p\d+)'), webpage, 'real id') playerpage = self._download_webpage( From b6e0c7d2e3bb17b36a3b6e16fa8fd67092658d6c Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Fri, 9 Oct 2020 07:06:49 +0200 Subject: [PATCH 48/49] [mtv] fix mtv.com and more(?) --- youtube_dlc/extractor/mtv.py | 41 ++++++++++++++++++++++++++++++++++-- youtube_dlc/utils.py | 7 ++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index fedd5f46bba..88c5eda383d 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -7,6 +7,7 @@ from ..compat import ( compat_str, compat_xpath, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -22,6 +23,7 @@ unescapeHTML, update_url_query, url_basename, + get_domain, xpath_text, ) @@ -253,7 +255,39 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) - def _extract_mgid(self, webpage): + def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=None): + # print(compat_urlparse.urlparse(url).netloc) + domain = get_domain(url) + if domain is None: + raise ExtractorError( + '[%s] could not get domain' % self.IE_NAME, + expected=True) + url = url.replace("https://", "http://") + enc_url = compat_urlparse.quote(url, safe='') + _TRIFORCE_V8_TEMPLATE = 'https://%s/feeds/triforce/manifest/v8?url=%s' + triforce_manifest_url = _TRIFORCE_V8_TEMPLATE % (domain, enc_url) + + manifest = self._download_json(triforce_manifest_url, video_id, fatal=False) + if manifest.get('manifest').get('type') == 'redirect': + self.to_screen('Found a redirect. 
Downloading manifest from new location') + new_loc = manifest.get('manifest').get('newLocation') + new_loc = new_loc.replace("https://", "http://") + enc_new_loc = compat_urlparse.quote(new_loc, safe='') + triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) + manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) + + item_id = try_get(manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str) + if not item_id: + self.to_screen('Found no id!') + return + + # 'episode' can be anything. 'content' is used often as well + _MGID_TEMPLATE = 'mgid:arc:episode:%s:%s' + mgid = _MGID_TEMPLATE % (domain, item_id) + + return mgid + + def _extract_mgid(self, webpage, url): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -275,6 +309,9 @@ def _extract_mgid(self, webpage): mgid = self._search_regex( r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) + if not mgid: + mgid = self._extract_new_triforce_mgid(webpage, url) + if not mgid: mgid = self._extract_triforce_mgid(webpage) @@ -283,7 +320,7 @@ def _extract_mgid(self, webpage): def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) videos_info = self._get_videos_info(mgid) return videos_info diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 32b179c6fcb..54a4ea2aaca 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -1984,6 +1984,7 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): self.attrs = {} compat_HTMLParser.__init__(self) @@ -2378,6 +2379,7 @@ class GeoRestrictedError(ExtractorError): This exception may be thrown when a video is not available from your geographic location due to geographic restrictions imposed by a website. 
""" + def __init__(self, msg, countries=None): super(GeoRestrictedError, self).__init__(msg, expected=True) self.msg = msg @@ -3558,6 +3560,11 @@ def remove_quotes(s): return s +def get_domain(url): + domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url) + return domain.group('domain') if domain else None + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip('/').split('/')[-1] From cf7cb9428745dc744129e0ba90c626919fb98f48 Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Fri, 9 Oct 2020 07:50:22 +0200 Subject: [PATCH 49/49] [mtvn] update mtv network related extractors --- youtube_dlc/extractor/bet.py | 2 ++ youtube_dlc/extractor/cmt.py | 6 ++++-- youtube_dlc/extractor/comedycentral.py | 2 +- youtube_dlc/extractor/mtv.py | 23 +++++++++++++---------- youtube_dlc/extractor/nick.py | 2 +- youtube_dlc/extractor/spike.py | 16 +++++++++++++--- youtube_dlc/extractor/vh1.py | 2 ++ 7 files changed, 36 insertions(+), 17 deletions(-) diff --git a/youtube_dlc/extractor/bet.py b/youtube_dlc/extractor/bet.py index d7ceaa85e45..2c714423503 100644 --- a/youtube_dlc/extractor/bet.py +++ b/youtube_dlc/extractor/bet.py @@ -3,6 +3,8 @@ from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate +# TODO Remove - Reason: Outdated Site + class BetIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' diff --git a/youtube_dlc/extractor/cmt.py b/youtube_dlc/extractor/cmt.py index e701fbeab82..a4ddb91609f 100644 --- a/youtube_dlc/extractor/cmt.py +++ b/youtube_dlc/extractor/cmt.py @@ -2,6 +2,8 @@ from .mtv import MTVIE +# TODO Remove - Reason: Outdated Site + class CMTIE(MTVIE): IE_NAME = 'cmt.com' @@ -39,7 +41,7 @@ class CMTIE(MTVIE): 'only_matching': True, }] - def _extract_mgid(self, webpage): + def _extract_mgid(self, webpage, url): mgid = self._search_regex( r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', webpage, 'mgid', group='mgid', default=None) @@ -50,5 +52,5 @@ def _extract_mgid(self, webpage): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dlc/extractor/comedycentral.py b/youtube_dlc/extractor/comedycentral.py index d08b909a68e..f54c4adeb9f 100644 --- a/youtube_dlc/extractor/comedycentral.py +++ b/youtube_dlc/extractor/comedycentral.py @@ -48,7 +48,7 @@ class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') + mgid = self._extract_mgid(webpage, url, data_zone='t2_lc_promo1') videos_info = self._get_videos_info(mgid) return videos_info diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 88c5eda383d..e545a9ef3bd 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -255,8 +255,10 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) - def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=None): + def _extract_new_triforce_mgid(self, webpage, url='', video_id=None): # print(compat_urlparse.urlparse(url).netloc) + if url == '': + return domain = 
get_domain(url) if domain is None: raise ExtractorError( @@ -268,13 +270,14 @@ def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=N triforce_manifest_url = _TRIFORCE_V8_TEMPLATE % (domain, enc_url) manifest = self._download_json(triforce_manifest_url, video_id, fatal=False) - if manifest.get('manifest').get('type') == 'redirect': - self.to_screen('Found a redirect. Downloading manifest from new location') - new_loc = manifest.get('manifest').get('newLocation') - new_loc = new_loc.replace("https://", "http://") - enc_new_loc = compat_urlparse.quote(new_loc, safe='') - triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) - manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) + if manifest: + if manifest.get('manifest').get('type') == 'redirect': + self.to_screen('Found a redirect. Downloading manifest from new location') + new_loc = manifest.get('manifest').get('newLocation') + new_loc = new_loc.replace("https://", "http://") + enc_new_loc = compat_urlparse.quote(new_loc, safe='') + triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) + manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) item_id = try_get(manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str) if not item_id: @@ -287,7 +290,7 @@ def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=N return mgid - def _extract_mgid(self, webpage, url): + def _extract_mgid(self, webpage, url, data_zone=None): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -313,7 +316,7 @@ def _extract_mgid(self, webpage, url): mgid = self._extract_new_triforce_mgid(webpage, url) if not mgid: - mgid = self._extract_triforce_mgid(webpage) + mgid = self._extract_triforce_mgid(webpage, data_zone) return mgid diff --git a/youtube_dlc/extractor/nick.py b/youtube_dlc/extractor/nick.py index 2e8b302ac85..04b98f7bde5 100644 --- a/youtube_dlc/extractor/nick.py +++ b/youtube_dlc/extractor/nick.py @@ -245,5 +245,5 @@ class NickRuIE(MTVServicesInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dlc/extractor/spike.py b/youtube_dlc/extractor/spike.py index aabff7a3ce7..3cee331f6a7 100644 --- a/youtube_dlc/extractor/spike.py +++ b/youtube_dlc/extractor/spike.py @@ -20,8 +20,18 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) + def _extract_mgid(self, webpage, url): + mgid = None + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + + if not mgid: + mgid = self._extract_new_triforce_mgid(webpage, url) + + return mgid + +# TODO Remove - Reason: Outdated Site class ParamountNetworkIE(MTVServicesInfoExtractor): @@ -43,7 +53,7 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): + def _extract_mgid(self, webpage, url): root_data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+})', webpage, 'data'), None) diff --git a/youtube_dlc/extractor/vh1.py b/youtube_dlc/extractor/vh1.py index dff94a2b845..ea576dc6ba6 100644 
--- a/youtube_dlc/extractor/vh1.py +++ b/youtube_dlc/extractor/vh1.py @@ -3,6 +3,8 @@ from .mtv import MTVServicesInfoExtractor +# TODO Remove - Reason: Outdated Site + class VH1IE(MTVServicesInfoExtractor): IE_NAME = 'vh1.com'
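
A note on the `get_domain` helper added to `youtube_dlc/utils.py` in patch 48, since the MTV-related changes above rely on it: it is a single regex match over the URL. The standalone sketch below simply restates the helper from that hunk with a few illustrative inputs (the sample URLs are made up for demonstration only):

```python
import re


def get_domain(url):
    # Body copied from the utils.py hunk in patch 48: optionally strip the
    # scheme and a leading "www.", then capture everything up to the first
    # path slash as the domain.
    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return domain.group('domain') if domain else None


print(get_domain('http://www.mtv.com/episodes/some-show'))    # mtv.com
print(get_domain('www.comedycentral.com/full-episodes/abc'))  # comedycentral.com
print(get_domain('not a url'))                                # None
```

Because an empty URL yields no domain and hence no usable manifest endpoint, patch 49 makes `_extract_new_triforce_mgid` return early when `url` is empty instead of raising later.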