From 8b2e4e87e0d3b11d9d60d8111280cee7bcaa3455 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Wed, 12 Aug 2015 15:53:13 +0200 Subject: [PATCH 1/7] [roosterteeth] added --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/roosterteeth.py | 335 +++++++++++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 youtube_dl/extractor/roosterteeth.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317eb7..becc51a3d51 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -498,6 +498,10 @@ from .reverbnation import ReverbNationIE from .ringtv import RingTVIE from .ro220 import Ro220IE +from .roosterteeth import ( + RoosterteethIE, + RoosterteethShowIE +) from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 00000000000..7d976579f28 --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,335 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + js_to_json, + ExtractorError, + compat_urllib_parse_urlparse, + compat_urllib_parse, + compat_urllib_request +) + + +class RoosterteethShowIE(InfoExtractor): + _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/show/(?P<id>[^/]+)(?:/season)?' + _TESTS = [{ + 'url': 'http://roosterteeth.com/show/screen-play', + 'info_dict': { + 'id': 'screen-play', + 'description': 'A Rooster Teeth podcast focusing on all things Film and TV. Listen to our pop culture geeks chat about TV premieres and finales, blockbuster franchises, indie darlings, casting rumors and spotlight a film to discuss in their weekly "Movie Book Club" segment. So pop some popcorn, grab a good seat and enjoy the show.', + 'title': 'Screen Play', + }, + 'playlist_count': 23 + }, { + 'url': 'http://roosterteeth.com/show/red-vs-blue#;season=.* 1$', + 'info_dict': { + 'id': 'red-vs-blue', + 'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.', + 'title': 'Red vs. Blue', + }, + 'playlist_count': 24 + }, { + 'url': 'http://roosterteeth.com/show/red-vs-blue', + 'info_dict': { + 'id': 'red-vs-blue', + 'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.', + 'title': 'Red vs. Blue', + }, + + 'playlist_mincount': 380 + }] + + def _real_extract(self, url): + ep_filter = {} + + if '#;' in url: + url, params = url.split('#;') + ep_filter = compat_urllib_parse.parse_qs(params) + + playlist_id = self._match_id(url) + html = self._download_webpage(url, playlist_id) + + title = self._html_search_regex(r'<div class="show-header">\s*<h1>([^<]+)</h1>\s*</div>', html, 'show title') + description = self._html_search_regex(r'<section class="show-details">((?:[^<]|<(?!/section>))+)</section>', html, 'show description') + + start_piece = "<div id='tab-content-episodes' class='tab-content'>" + start = html.find(start_piece) + if start == -1: + raise ExtractorError("Can't find the episodes!") + + html = html[start + len(start_piece):].lstrip() + sections = [] + if html.startswith('<ul class='): + # This show doesn't have seasons AKA sections. + end = html.find('</ul>') + if end == -1: + raise ExtractorError("Can't find the end of the episode list!") + + sections = [(None, html[:end])] + else: + # We have to extract the sections. + end = html.find('</article></section></section>') + if end == -1: + raise ExtractorError("Can't find the end of the section list!") + + html = html[:end] + HEADER_RE = re.compile(r"<h3 class='title' id='header-[^']+'>([^<]+)</h3>") + + # Process sections / seasons + for section in html.split('</section>'): + sec_title = self._html_search_regex(HEADER_RE, section, 'season title') + start = section.find("<ul class='episode-blocks'>") + end = section.find("</ul>", start) + + if start < 0 or end < 0: + raise ExtractorError("Couldn't parse season %s! (%s)" % (sec_title, playlist_id)) + + sections.append((sec_title, section[start:end])) + + results = [] + EP_RE = re.compile(r'<a href="(?P<url>[^"]+)">(?:[^<]|<(?!p class="name"))+<p class="name">(?P<title>[^<]+)</p>\s*</a>') + + for sec_title, part in reversed(sections): + episodes = part.split('</li>') + for ep_part in episodes: + if ep_part.strip() == '': + continue + + ep = EP_RE.search(ep_part) + if not ep: + raise ExtractorError("Failed to parse an episode of season %s! (%s, %s)" % (sec_title or '0', playlist_id, ep_part)) + + url = clean_html(ep.group('url')) + if sec_title: + # Pass the season title to the video extractor. + url += '#;' + compat_urllib_parse.urlencode({'season': sec_title}) + res = self.url_result(url, 'Roosterteeth') + res['season'] = sec_title + else: + res = self.url_result(url, 'Roosterteeth') + + if self._match_filter(res, ep_filter): + results.append(res) + + if len(sections) == 1 and sections[0][0] is None: + # If the page didn't contain sections, then the episodes are in reverse order. + results = list(reversed(results)) + + return self.playlist_result(results, playlist_id, title, description) + + def _match_filter(self, item, filter_rules): + for k, v in filter_rules.items(): + if isinstance(v, list) and len(v) > 1: + # A list of acceptable values + if item.get(k) not in v: + return False + else: + if not re.match(v[0], item.get(k)): + return False + + return True + + +class RoosterteethIE(InfoExtractor): + _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)' + _TESTS = [ + { + 'params': { + # Without this parameter ytdl downloads the whole file. + 'hls_prefer_native': True + }, + + 'url': 'http://achievementhunter.com/episode/rage-quit-season-1-episode-199', + 'md5': '828fe30ccdddf5d85e444e33686d531a', + 'info_dict': { + 'id': 'rage-quit-season-1-episode-199', + 'ext': 'mp4', + 'title': 'Rage Quit - No Time to Explain', + 'description': 'There\'s no time to explain this video.', + 'thumbnail': r're:^http://s3\.amazonaws\.com/cdn\.roosterteeth\.com/uploads/images/[a-f0-9-]+/md/[a-z0-9-]+\.jpeg$', + 'protocol': 'm3u8', + 'url': r're:^http://[a-zA-Z0-9.]+\.taucdn\.net/[0-9a-zA-Z]+/video/uploads/videos/[0-9a-f-]+/[0-9A-Z]+\.m3u8$', + } + }, + { + 'url': 'http://roosterteeth.com/episode/red-vs-blue-season-1-episode-1', + 'md5': '80277833f3ed946b553d13cf8e27443d', + 'info_dict': { + 'id': 'red-vs-blue-season-1-episode-1', + 'ext': 'mp4', + + 'title': 'Why Are We Here? - Episode 1 - Red vs. Blue Season 1', + 'thumbnail': r're:^https://i\.ytimg\.com/vi/[0-9a-zA-Z]+/maxresdefault\.jpg$', + 'url': r're:^https://[0-9a-z-]+\.googlevideo\.com/videoplayback', + + 'upload_date': '20150306', + 'uploader_id': 'UCII0hP2Ycmhh5j8lS4cexBQ', + 'uploader': 'Red vs. Blue', + 'description': 'The first episode of Red vs. Blue introduces the main characters, and poses the all-important question, why are we here?' + } + } + ] + _NETRC_MACHINE = 'roosterteeth' + _authed = None + _sponsor = None + + def _real_initialize(self): + self._authed = {} + + def _real_extract(self, url): + if '#;' in url: + url, params = url.split('#;') + params = compat_urllib_parse.parse_qs(params) + else: + params = {} + + video_id = self._match_id(url) + html = self._download_webpage(url, video_id) + + if html.find('Unfortunately, this is sponsor-only.') > -1: + domain = compat_urllib_parse_urlparse(url).netloc + release = re.search(r'<p>[^<]+ Releases ([0-9]+ [a-zA-Z]+) from now</p>', html) + if release: + release = ' The video will be public in %s.' % release.group(1) + else: + release = '' + + if not self._login(domain): + raise ExtractorError("This video is sponsor-only. You didn't provide your credentials or the login failed.%s" % release, expected=True) + + # Try again. + html = self._download_webpage(url, video_id) + if html.find('Unfortunately, this is sponsor-only.') > -1: + if not self._is_sponsor(domain): + raise ExtractorError('This video is sponsor-only but you are not a sponsor.%s' % release, expected=True) + else: + raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.') + + js = self._html_search_regex(r'<script src="https?://roosterteeth\.com/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') + info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js) + if not info: + raise ExtractorError("Can't parse the video metadata! (%s)" % js) + + player = info.group('player') + meta = self._parse_json(js_to_json(info.group('json')), video_id) + if player == 'jwplayer': + # Make sure that all values are there. + for attr in ('containerId', 'videoImage', 'videoTitle', 'manifest'): + if attr not in meta: + raise ExtractorError('Unexpected video info! Attribute %s is missing.' % attr) + + video_image = meta['videoImage'] + if video_image.startswith('//'): + video_image = 'http:' + video_image + + res = { + 'id': video_id, + 'title': meta['videoTitle'].strip(), + 'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4'), + 'thumbnail': video_image + } + elif player == 'youtube': + if 'youtubeKey' not in meta: + raise ExtractorError('Invalid metadata for youtube video!') + + res = self.url_result('https://youtube.com/watch?v=' + meta['youtubeKey']) + res['_type'] = 'url_transparent' + res['id'] = video_id + else: + raise ExtractorError('Unknown player type %s!' % player) + + if 'season' in params: + res['season'] = params['season'][0] + + desc = self._og_search_description(html) + if desc: + res['description'] = desc.strip() + + return res + + def _login(self, domain='roosterteeth.com'): + """ + Attempt to log in to RoosterTeeth (or Achievement Hunter). + NOTE: RT is planning to implement SSO which will probably change how this works. + """ + + if domain in self._authed: + return self._authed[domain] + + (username, password) = self._get_login_info() + + # No authentication to be performed + if username is None: + return False + + LOGIN_URL = 'http://%s/login' % domain + login_page, hdl = self._download_webpage_handle( + LOGIN_URL, None, + note='Downloading login page', + errnote='unable to fetch login page', fatal=False) + + if login_page is False: + return False + + if hdl.geturl() != LOGIN_URL: + # We were redirected which means that we're already logged in. + self._authed[domain] = True + return True + + token = self._search_regex(r'(?s)<input.+?name="_token".+?value="(.+?)"', + login_page, 'Login token') + + # Log in + login_form_strs = { + '_token': token, + 'username': username, + 'password': password + } + + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') + + req = compat_urllib_request.Request(LOGIN_URL, login_data, {'Content-Type': 'application/x-www-form-urlencoded'}) + login_results = self._download_webpage( + req, None, + note='Logging in', errnote='unable to log in', fatal=False) + + if login_results is False: + return False + + if login_results.find('Error in exception handler.') > -1 or login_results.find('Authentication failed. Please check and try again, or reset your password') > -1: + self.report_warning('unable to log in: bad username or password') + self._authed[domain] = False + return False + + self._authed[domain] = True + return True + + def _is_sponsor(self, domain='roosterteeth.com'): + if self._sponsor is None: + username, _ = self._get_login_info() + profile_page = 'http://%s/user/%s' % (domain, compat_urllib_parse.quote(username)) + html = self._download_webpage( + profile_page, None, + note='Checking user profile...', + errnote='unable to access user profile', fatal=False) + + if not html: + return False + + user_info = self._search_regex( + r'<div class="sidebar-profile-header">\s*<p[^>]+>\s*<a href="%s">[^<]+</a>\s*<span>((?:[^<]|<(?!/span>))+)</span>' % (profile_page), + html, 'user status', fatal=False) + + if not user_info: + return False + + self._sponsor = '<i class="icon ion-star"></i>' in user_info + + return self._sponsor From 01d6c90dd59854b80ff0f55daecc479d5ccf3314 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Wed, 12 Aug 2015 19:24:44 +0200 Subject: [PATCH 2/7] [roosterteeth] Fix for Python 2 --- youtube_dl/extractor/roosterteeth.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 7d976579f28..44f1e27468d 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -8,6 +8,7 @@ clean_html, js_to_json, ExtractorError, + compat_parse_qs, compat_urllib_parse_urlparse, compat_urllib_parse, compat_urllib_request @@ -48,7 +49,7 @@ def _real_extract(self, url): if '#;' in url: url, params = url.split('#;') - ep_filter = compat_urllib_parse.parse_qs(params) + ep_filter = compat_parse_qs(params) playlist_id = self._match_id(url) html = self._download_webpage(url, playlist_id) @@ -183,7 +184,7 @@ def _real_initialize(self): def _real_extract(self, url): if '#;' in url: url, params = url.split('#;') - params = compat_urllib_parse.parse_qs(params) + params = compat_parse_qs(params) else: params = {} From 147b78b6bf09cf4c5add6ef411e10248ca017047 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Wed, 12 Aug 2015 19:31:35 +0200 Subject: [PATCH 3/7] [roosterteeth] Remove custom filter and correct the video info regex. --- youtube_dl/extractor/roosterteeth.py | 49 +++------------------------- 1 file changed, 5 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 44f1e27468d..d997810cfbd 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -25,14 +25,6 @@ class RoosterteethShowIE(InfoExtractor): 'title': 'Screen Play', }, 'playlist_count': 23 - }, { - 'url': 'http://roosterteeth.com/show/red-vs-blue#;season=.* 1$', - 'info_dict': { - 'id': 'red-vs-blue', - 'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.', - 'title': 'Red vs. Blue', - }, - 'playlist_count': 24 }, { 'url': 'http://roosterteeth.com/show/red-vs-blue', 'info_dict': { @@ -45,12 +37,6 @@ class RoosterteethShowIE(InfoExtractor): }] def _real_extract(self, url): - ep_filter = {} - - if '#;' in url: - url, params = url.split('#;') - ep_filter = compat_parse_qs(params) - playlist_id = self._match_id(url) html = self._download_webpage(url, playlist_id) @@ -105,16 +91,12 @@ def _real_extract(self, url): raise ExtractorError("Failed to parse an episode of season %s! (%s, %s)" % (sec_title or '0', playlist_id, ep_part)) url = clean_html(ep.group('url')) + res = self.url_result(url, 'Roosterteeth') + if sec_title: - # Pass the season title to the video extractor. - url += '#;' + compat_urllib_parse.urlencode({'season': sec_title}) - res = self.url_result(url, 'Roosterteeth') res['season'] = sec_title - else: - res = self.url_result(url, 'Roosterteeth') - if self._match_filter(res, ep_filter): - results.append(res) + results.append(res) if len(sections) == 1 and sections[0][0] is None: # If the page didn't contain sections, then the episodes are in reverse order. @@ -122,18 +104,6 @@ def _real_extract(self, url): return self.playlist_result(results, playlist_id, title, description) - def _match_filter(self, item, filter_rules): - for k, v in filter_rules.items(): - if isinstance(v, list) and len(v) > 1: - # A list of acceptable values - if item.get(k) not in v: - return False - else: - if not re.match(v[0], item.get(k)): - return False - - return True - class RoosterteethIE(InfoExtractor): _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)' @@ -182,12 +152,6 @@ def _real_initialize(self): self._authed = {} def _real_extract(self, url): - if '#;' in url: - url, params = url.split('#;') - params = compat_parse_qs(params) - else: - params = {} - video_id = self._match_id(url) html = self._download_webpage(url, video_id) @@ -210,7 +174,7 @@ def _real_extract(self, url): else: raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.') - js = self._html_search_regex(r'<script src="https?://roosterteeth\.com/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') + js = self._html_search_regex(r'<script src="https?://(?:roosterteeth\.com|achievementhunter\.com|fun\.haus)/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js) if not info: raise ExtractorError("Can't parse the video metadata! (%s)" % js) @@ -243,13 +207,10 @@ def _real_extract(self, url): else: raise ExtractorError('Unknown player type %s!' % player) - if 'season' in params: - res['season'] = params['season'][0] - desc = self._og_search_description(html) if desc: res['description'] = desc.strip() - + return res def _login(self, domain='roosterteeth.com'): From b6423cd5b1c1d12c0efc27964566b8bbf046cb77 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Wed, 12 Aug 2015 21:29:18 +0200 Subject: [PATCH 4/7] [roosterteeth] Use the native HLS implementation by default and add the season name to episodes when downloading from a show page. --- youtube_dl/extractor/roosterteeth.py | 30 +++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index d997810cfbd..63105282408 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -11,7 +11,9 @@ compat_parse_qs, compat_urllib_parse_urlparse, compat_urllib_parse, - compat_urllib_request + compat_urllib_request, + smuggle_url, + unsmuggle_url ) @@ -92,9 +94,11 @@ def _real_extract(self, url): url = clean_html(ep.group('url')) res = self.url_result(url, 'Roosterteeth') + res['title'] = clean_html(ep.group('title')) if sec_title: - res['season'] = sec_title + res['url'] = smuggle_url(res['url'], {'season': sec_title}) + res['title'] = '%s: %s' % (sec_title, res['title']) results.append(res) @@ -109,11 +113,6 @@ class RoosterteethIE(InfoExtractor): _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)' _TESTS = [ { - 'params': { - # Without this parameter ytdl downloads the whole file. - 'hls_prefer_native': True - }, - 'url': 'http://achievementhunter.com/episode/rage-quit-season-1-episode-199', 'md5': '828fe30ccdddf5d85e444e33686d531a', 'info_dict': { @@ -122,7 +121,7 @@ class RoosterteethIE(InfoExtractor): 'title': 'Rage Quit - No Time to Explain', 'description': 'There\'s no time to explain this video.', 'thumbnail': r're:^http://s3\.amazonaws\.com/cdn\.roosterteeth\.com/uploads/images/[a-f0-9-]+/md/[a-z0-9-]+\.jpeg$', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', 'url': r're:^http://[a-zA-Z0-9.]+\.taucdn\.net/[0-9a-zA-Z]+/video/uploads/videos/[0-9a-f-]+/[0-9A-Z]+\.m3u8$', } }, @@ -133,7 +132,7 @@ class RoosterteethIE(InfoExtractor): 'id': 'red-vs-blue-season-1-episode-1', 'ext': 'mp4', - 'title': 'Why Are We Here? - Episode 1 - Red vs. Blue Season 1', + 'title': 'Episode 1', 'thumbnail': r're:^https://i\.ytimg\.com/vi/[0-9a-zA-Z]+/maxresdefault\.jpg$', 'url': r're:^https://[0-9a-z-]+\.googlevideo\.com/videoplayback', @@ -152,6 +151,7 @@ def _real_initialize(self): self._authed = {} def _real_extract(self, url): + url, data = unsmuggle_url(url) video_id = self._match_id(url) html = self._download_webpage(url, video_id) @@ -174,7 +174,7 @@ def _real_extract(self, url): else: raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.') - js = self._html_search_regex(r'<script src="https?://(?:roosterteeth\.com|achievementhunter\.com|fun\.haus)/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') + js = self._html_search_regex(r'<script src="https?://(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus)/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js) if not info: raise ExtractorError("Can't parse the video metadata! (%s)" % js) @@ -193,8 +193,7 @@ def _real_extract(self, url): res = { 'id': video_id, - 'title': meta['videoTitle'].strip(), - 'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4'), + 'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4', entry_protocol='m3u8_native'), 'thumbnail': video_image } elif player == 'youtube': @@ -211,6 +210,13 @@ def _real_extract(self, url): if desc: res['description'] = desc.strip() + res['raw_title'] = self._html_search_regex(r'<title>([^<]+)</title>', html, 'video title') + if data and 'season' in data: + res['title'] = '%s: %s' % (data['season'], res['raw_title']) + res['season'] = data['season'] + else: + res['title'] = res['raw_title'] + return res def _login(self, domain='roosterteeth.com'): From 73f537ce18883d8d3f3337306368491ea2334cf7 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Wed, 12 Aug 2015 21:42:41 +0200 Subject: [PATCH 5/7] [roosterteeth] Relax the thumbnail and URL checks --- youtube_dl/extractor/roosterteeth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 63105282408..8da542c3ee7 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -120,9 +120,9 @@ class RoosterteethIE(InfoExtractor): 'ext': 'mp4', 'title': 'Rage Quit - No Time to Explain', 'description': 'There\'s no time to explain this video.', - 'thumbnail': r're:^http://s3\.amazonaws\.com/cdn\.roosterteeth\.com/uploads/images/[a-f0-9-]+/md/[a-z0-9-]+\.jpeg$', + 'thumbnail': r're:^https?://.*\.jpeg$', 'protocol': 'm3u8_native', - 'url': r're:^http://[a-zA-Z0-9.]+\.taucdn\.net/[0-9a-zA-Z]+/video/uploads/videos/[0-9a-f-]+/[0-9A-Z]+\.m3u8$', + 'url': r're:^https?://[a-zA-Z0-9.]+\.taucdn\.net/.*\.m3u8$', } }, { From 1b36d0bb5e9861d3613b9cdced4294247831a3d0 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Tue, 25 Aug 2015 15:14:04 +0200 Subject: [PATCH 6/7] Small consistency improvements --- youtube_dl/extractor/roosterteeth.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8da542c3ee7..8872d312d41 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -200,9 +200,11 @@ def _real_extract(self, url): if 'youtubeKey' not in meta: raise ExtractorError('Invalid metadata for youtube video!') - res = self.url_result('https://youtube.com/watch?v=' + meta['youtubeKey']) - res['_type'] = 'url_transparent' - res['id'] = video_id + res = { + '_type': 'url_transparent', + 'url': 'https://youtube.com/watch?v=' + meta['youtubeKey'], + 'id': video_id + } else: raise ExtractorError('Unknown player type %s!' % player) @@ -285,7 +287,7 @@ def _is_sponsor(self, domain='roosterteeth.com'): profile_page = 'http://%s/user/%s' % (domain, compat_urllib_parse.quote(username)) html = self._download_webpage( profile_page, None, - note='Checking user profile...', + note='Checking user profile', errnote='unable to access user profile', fatal=False) if not html: From 00fe5a46cacf7e13ecef481aae444ee799495739 Mon Sep 17 00:00:00 2001 From: ngld <ngld@tproxy.de> Date: Fri, 2 Oct 2015 00:46:33 +0200 Subject: [PATCH 7/7] [roosterteeth] Updated extractor --- youtube_dl/extractor/roosterteeth.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8872d312d41..6e673ecd54f 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -110,7 +110,7 @@ def _real_extract(self, url): class RoosterteethIE(InfoExtractor): - _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)' _TESTS = [ { 'url': 'http://achievementhunter.com/episode/rage-quit-season-1-episode-199', @@ -174,29 +174,27 @@ def _real_extract(self, url): else: raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.') - js = self._html_search_regex(r'<script src="https?://(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus)/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info') - info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js) - if not info: - raise ExtractorError("Can't parse the video metadata! (%s)" % js) + p = re.search(r'<script src="https?://(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus)/scripts/lib/(?P<player>jwplayer|youtube)\.(?:min\.)?js"></script>\s*<script>\s*(?P<script>[^<]+)\s*</script>', html) + if not p: + raise ExtractorError("Can't parse the video metadata! (%s)" % video_id) - player = info.group('player') - meta = self._parse_json(js_to_json(info.group('json')), video_id) + player = p.group('player') if player == 'jwplayer': - # Make sure that all values are there. - for attr in ('containerId', 'videoImage', 'videoTitle', 'manifest'): - if attr not in meta: - raise ExtractorError('Unexpected video info! Attribute %s is missing.' % attr) - - video_image = meta['videoImage'] + video_image = self._search_regex(r"var videoImage = '([^']+)';", p.group('script'), 'video image') if video_image.startswith('//'): video_image = 'http:' + video_image + manifest = self._search_regex(r"RT\.jwplayer\.player\([^\{]+\{\s*file: '([^']+)',", p.group('script'), 'manifest') + res = { 'id': video_id, - 'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4', entry_protocol='m3u8_native'), + 'formats': self._extract_m3u8_formats(manifest, video_id, ext='mp4'), 'thumbnail': video_image } elif player == 'youtube': + info = self._html_search_regex(r'RT\.(?:youtube|jwplayer)\.player\((\{(?:[^}]|\}(?!\);))+\})\);', p.group('script'), 'video metadata') + meta = self._parse_json(js_to_json(info), video_id) + if 'youtubeKey' not in meta: raise ExtractorError('Invalid metadata for youtube video!')