From bd1340d29483f52bddf27f11adbc3ad488cb0968 Mon Sep 17 00:00:00 2001 From: Glenn Slayden <5589855+glenn-slayden@users.noreply.github.com> Date: Wed, 24 Jun 2020 23:02:06 -0700 Subject: [PATCH 1/2] Prevent HTTP 301 for YouTube playlist continuations When a YouTube playlist or channel listing has more than one page of videos, the continuation URLs specify `youtube.com` instead of `www.youtube.com`. This causes an unnecessary HTTP round-trip for each continuation page the extractor accesses. **Example** youtube-dl -s --print-traffic https://www.youtube.com/channel/UCBR8-60-B28hp2BmDPdntcQ **Before** GET /playlist?list=UUBR8-60-B28hp2BmDPdntcQ&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK GET /browse_ajax?action_continuation=1&continuation=4qmFsgIsEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoOZWdaUVZEcERSMUUlM0Q%253D&disable_polymer=true Host: youtube.com HTTP/1.1 301 Moved Permanently Location: https://www.youtube.com/browse_ajax?action_continuation=1&continuation=4qmFsgIsEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoOZWdaUVZEcERSMUUlM0Q%253D&disable_polymer=true GET /browse_ajax?action_continuation=1&continuation=4qmFsgIsEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoOZWdaUVZEcERSMUUlM0Q%253D&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK GET /browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERUV2RD&disable_polymer=true Host: youtube.com HTTP/1.1 301 Moved Permanently Location: https://www.youtube.com/browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERUV2RD&disable_polymer=true GET /browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERUV2RD&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK GET /browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERTM2RE&disable_polymer=true Host: youtube.com HTTP/1.1 301 Moved Permanently Location: https://www.youtube.com/browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERTM2RE&disable_polymer=true GET /browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERTM2RE&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK **After** GET /playlist?list=UUBR8-60-B28hp2BmDPdntcQ&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK GET /browse_ajax?action_continuation=1&continuation=4qmFsgIsEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoOZWdaUVZEcERSMUUlM0Q%253D&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK GET /browse_ajax?action_continuation=1&continuation=4qmFsgIqEhpWTFVVQlI4LTYwLUIyOGhwMkJtRFBkbnRjURoMZWdkUVZEcERUV2RD&disable_polymer=true Host: www.youtube.com HTTP/1.1 200 OK --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bc79e01478..638c6617eaf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -303,7 +303,7 @@ def _entries(self, page, playlist_id): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, From 9050896212f28269f1ab6abd2944ab98be8b5b86 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Wed, 1 Jul 2020 02:54:58 +0700 Subject: [PATCH 2/2] Update youtube.py --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 638c6617eaf..4ab58d6b651 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2776,7 +2776,7 @@ def _extract_mix(self, playlist_id): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = orderedSet(re.findall( @@ -3286,7 +3286,7 @@ def _entries(self, page): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, headers=self._YOUTUBE_CLIENT_HEADERS)