From 4a56c4d55911ec3a884e24532f783345df5efb04 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 6 Nov 2019 19:56:10 +0100 Subject: [PATCH] [kinja] add support for Kinja embeds closes #5756 closes #11282 closes #22237 closes #22384 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 17 ++- youtube_dl/extractor/kinja.py | 221 +++++++++++++++++++++++++++ youtube_dl/extractor/onionstudios.py | 54 +------ 4 files changed, 241 insertions(+), 52 deletions(-) create mode 100644 youtube_dl/extractor/kinja.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f43b284d5e2..9e3b554fa6d3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -513,6 +513,7 @@ from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c0780e986a4..3d919f656fc8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE class GenericIE(InfoExtractor): @@ -1487,16 +1488,18 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # OnionStudios embed + # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'info_dict': { - 'id': '2855', + 'id': '106351', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', + 'description': 'Migrated from OnionStudios', 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', + 'uploader': 'clickhole', + 'upload_date': '20150527', + 'timestamp': 1432744860, } }, # SnagFilms embed @@ -2894,6 +2897,12 @@ def _real_extract(self, url): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') + # Look for Kinja embeds + kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) + if kinja_embed_urls: + return self.playlist_from_matches( + kinja_embed_urls, video_id, video_title) + # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py new file mode 100644 index 000000000000..79e3026d2f4a --- /dev/null +++ b/youtube_dl/extractor/kinja.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + unescapeHTML, + urljoin, +) + + +class KinjaEmbedIE(InfoExtractor): + IENAME = 'kinja:embed' + _DOMAIN_REGEX = r'''(?:[^.]+\.)? + (?: + avclub| + clickhole| + deadspin| + gizmodo| + jalopnik| + jezebel| + kinja| + kotaku| + lifehacker| + splinternews| + the(?:inventory|onion|root|takeout) + )\.com''' + _COMMON_REGEX = r'''/ + (?: + ajax/inset| + embed/video + )/iframe\?.*?\bid=''' + _VALID_URL = r'''(?x)https?://%s%s + (?P + fb| + imgur| + instagram| + jwp(?:layer)?-video| + kinjavideo| + mcp| + megaphone| + ooyala| + soundcloud(?:-playlist)?| + tumblr-post| + twitch-stream| + twitter| + ustream-channel| + vimeo| + vine| + youtube-(?:list|video) + )-(?P[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _TESTS = [{ + 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', + 'only_matching': True, + }] + _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') + _PROVIDER_MAP = { + 'fb': ('facebook.com/video.php?v=', 'Facebook'), + 'imgur': ('imgur.com/', 'Imgur'), + 'instagram': ('instagram.com/p/', 'Instagram'), + 'jwplayer-video': _JWPLATFORM_PROVIDER, + 'jwp-video': _JWPLATFORM_PROVIDER, + 'megaphone': ('player.megaphone.fm/', 'Generic'), + 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), + 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), + 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), + 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), + 'twitch-stream': ('twitch.tv/', 'TwitchStream'), + 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), + 'ustream-channel': ('ustream.tv/embed/', 'Ustream'), + 'vimeo': ('vimeo.com/', 'Vimeo'), + 'vine': ('vine.co/v/', 'Vine'), + 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), + 'youtube-video': ('youtube.com/embed/', 'Youtube'), + } + + @staticmethod + def _extract_urls(webpage, url): + return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( + r'(?x)]+?src=(?P["\'])(?P(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), + webpage)] + + def _real_extract(self, url): + video_type, video_id = re.match(self._VALID_URL, url).groups() + + provider = self._PROVIDER_MAP.get(video_type) + if provider: + video_id = compat_urllib_parse_unquote(video_id) + if video_type == 'tumblr-post': + video_id, blog = video_id.split('-', 1) + result_url = provider[0] % (blog, video_id) + elif video_type == 'youtube-list': + video_id, playlist_id = video_id.split('/') + result_url = provider[0] % (video_id, playlist_id) + else: + if video_type == 'ooyala': + video_id = video_id.split('/')[0] + result_url = provider[0] + video_id + return self.url_result('http://' + result_url, provider[1]) + + if video_type == 'kinjavideo': + data = self._download_json( + 'https://kinja.com/api/core/video/views/videoById', + video_id, query={'videoId': video_id})['data'] + title = data['title'] + + formats = [] + for k in ('signedPlaylist', 'streaming'): + m3u8_url = data.get(k + 'Url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + thumbnail = None + poster = data.get('poster') or {} + poster_id = poster.get('id') + if poster_id: + thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'formats': formats, + 'tags': data.get('tags'), + 'timestamp': int_or_none(try_get( + data, lambda x: x['postInfo']['publishTimeMillis']), 1000), + 'thumbnail': thumbnail, + 'uploader': data.get('network'), + } + else: + video_data = self._download_json( + 'https://api.vmh.univision.com/metadata/v1/content/' + video_id, + video_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + video_id, query={'mcpids': video_id})['data'][0] + formats = [] + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), + 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), + } diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 7f8c6f0d31a0..cf5c39e66b95 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,13 +4,8 @@ import re from .common import InfoExtractor -from ..utils import ( - compat_str, - int_or_none, - js_to_json, - parse_iso8601, - try_get, -) +from ..compat import compat_str +from ..utils import js_to_json class OnionStudiosIE(InfoExtractor): @@ -20,7 +15,7 @@ class OnionStudiosIE(InfoExtractor): 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', 'md5': '5a118d466d62b5cd03647cf2c593977f', 'info_dict': { - 'id': '2937', + 'id': '3459881', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', 'description': 'md5:545299bda6abf87e5ec666548c6a9448', @@ -53,43 +48,6 @@ def _real_extract(self, url): mcp_id = compat_str(self._parse_json(self._search_regex( r'window\.mcpMapping\s*=\s*({.+?});', webpage, 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) - video_data = self._download_json( - 'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, - mcp_id)['videoMetadata'] - iptc = video_data['photoVideoMetadataIPTC'] - title = iptc['title']['en'] - fmg = video_data.get('photoVideoMetadata_fmg') or {} - tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' - data = self._download_json( - tvss_domain + '/api/v3/video-auth/url-signature-tokens', - mcp_id, query={'mcpids': mcp_id})['data'][0] - formats = [] - - rendition_url = data.get('renditionUrl') - if rendition_url: - formats = self._extract_m3u8_formats( - rendition_url, mcp_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - - fallback_rendition_url = data.get('fallbackRenditionUrl') - if fallback_rendition_url: - formats.append({ - 'format_id': 'fallback', - 'tbr': int_or_none(self._search_regex( - r'_(\d+)\.mp4', fallback_rendition_url, - 'bitrate', default=None)), - 'url': fallback_rendition_url, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), - 'uploader': fmg.get('network'), - 'duration': int_or_none(iptc.get('fileDuration')), - 'formats': formats, - 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), - 'timestamp': parse_iso8601(iptc.get('dateReleased')), - } + return self.url_result( + 'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id, + 'KinjaEmbed', mcp_id)