From 12e733bfcb9c06277d30af3ecf04a4d02d237538 Mon Sep 17 00:00:00 2001 From: Lam Date: Mon, 20 Apr 2020 20:41:52 +0200 Subject: [PATCH] Revert "[funnyordie] move extraction to VoxMedia extractor and improve vox volume embed extraction(closes #16846)" This reverts commit 77139f68f9e4ecae3089813b062d3cbf21e7469b. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/funnyordie.py | 162 +++++++++++++++++++++++++++++ youtube_dl/extractor/voxmedia.py | 101 ++++++------------ 3 files changed, 197 insertions(+), 67 deletions(-) create mode 100644 youtube_dl/extractor/funnyordie.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5c66e155e4eb..524c704cf2f4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -397,6 +397,7 @@ ) from .funimation import FunimationIE from .funk import FunkIE +from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .fxnetworks import FXNetworksIE from .gaia import GaiaIE diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py new file mode 100644 index 000000000000..f85e7de1496b --- /dev/null +++ b/youtube_dl/extractor/funnyordie.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + unified_timestamp, +) + + +class FunnyOrDieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?Pembed|articles|videos)/(?P[0-9a-f]+)(?:$|[?#/])' + _TESTS = [{ + 'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', + 'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9', + 'info_dict': { + 'id': '0732f586d7', + 'ext': 'mp4', + 'title': 'Heart-Shaped Box: Literal Video Version', + 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', + 'thumbnail': r're:^http:.*\.jpg$', + 'uploader': 'DASjr', + 'timestamp': 1317904928, + 'upload_date': '20111006', + 'duration': 318.3, + }, + }, { + 'url': 'http://www.funnyordie.com/embed/e402820827', + 'info_dict': { + 'id': 'e402820827', + 'ext': 'mp4', + 'title': 'Please Use This Song (Jon Lajoie)', + 'description': 'Please use this to sell something. www.jonlajoie.com', + 'thumbnail': r're:^http:.*\.jpg$', + 'timestamp': 1398988800, + 'upload_date': '20140502', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + links = re.findall(r']+src=(["\'])(?P.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') + + formats = [] + + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + source_formats = list(filter( + lambda f: f.get('vcodec') != 'none', m3u8_formats)) + + bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] + bitrates.sort() + + if source_formats: + self._sort_formats(source_formats) + + for bitrate, f in zip(bitrates, source_formats or [{}] * len(bitrates)): + for path, ext in links: + ff = f.copy() + if ff: + if ext != 'mp4': + ff = dict( + [(k, v) for k, v in ff.items() + if k in ('height', 'width', 'format_id')]) + ff.update({ + 'format_id': ff['format_id'].replace('hls', ext), + 'ext': ext, + 'protocol': 'http', + }) + else: + ff.update({ + 'format_id': '%s-%d' % (ext, bitrate), + 'vbr': bitrate, + }) + ff['url'] = self._proto_relative_url( + '%s%d.%s' % (path, bitrate, ext)) + formats.append(ff) + self._check_formats(formats, video_id) + + formats.extend(m3u8_formats) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + subtitles = {} + for src, src_lang in re.findall(r']+\bclass=["\']channel-preview-name[^>]+>(.+?)[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com|recode\.net)/(?:[^/]+/)*(?P[^/?]+)' _TESTS = [{ - # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', 'info_dict': { - 'id': 'j4mLW6x17VM', + 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', 'ext': 'mp4', - 'title': 'Material world: how Google discovered what software is made of', - 'description': 'md5:dfc17e7715e3b542d66e33a109861382', - 'upload_date': '20190710', - 'uploader_id': 'TheVerge', - 'uploader': 'The Verge', + 'title': 'Google\'s new material design direction', + 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', }, - 'add_ie': ['Youtube'], + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], }, { - # Volume embed, Youtube + # data-ooyala-id 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'd744484ff127884cd2ba09e3fa604e4b', 'info_dict': { - 'id': 'Gy8Md3Eky38', + 'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B', 'ext': 'mp4', 'title': 'The Nexus 6: hands-on with Google\'s phablet', - 'description': 'md5:d9f0216e5fb932dd2033d6db37ac3f1d', - 'uploader_id': 'TheVerge', - 'upload_date': '20141021', - 'uploader': 'The Verge', + 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', }, - 'add_ie': ['Youtube'], - 'skip': 'similar to the previous test', + 'add_ie': ['Ooyala'], + 'skip': 'Video Not Found', }, { - # Volume embed, Youtube + # volume embed 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', 'ext': 'mp4', - 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", - 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', - 'upload_date': '20150915', - 'uploader': 'Vox', + 'title': 'The new frontier of LGBTQ civil rights, explained', + 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', }, - 'add_ie': ['Youtube'], - 'skip': 'similar to the previous test', + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], }, { # youtube embed 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', @@ -124,7 +93,6 @@ class VoxMediaIE(InfoExtractor): 'uploader': 'Vox', }, 'add_ie': ['Youtube'], - 'skip': 'Page no longer contain videos', }, { # SBN.VideoLinkset.entryGroup multiple ooyala embeds 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', @@ -150,11 +118,10 @@ class VoxMediaIE(InfoExtractor): 'description': 'md5:e02d56b026d51aa32c010676765a690d', }, }], - 'skip': 'Page no longer contain videos', }, { # volume embed, Brightcove Once 'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya', - 'md5': '2dbc77b8b0bff1894c2fce16eded637d', + 'md5': '01571a896281f77dc06e084138987ea2', 'info_dict': { 'id': '1231c973d', 'ext': 'mp4',