Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[sproutvideo] Add new extractor (closes #7935, replaces #21962) #27685

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions youtube_dl/downloader/hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def real_download(self, filename, info_dict):
s = urlh.read().decode('utf-8', 'ignore')

if not self.can_download(s, info_dict):
if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'):
if info_dict.get('extra_param_to_segment_url') or info_dict.get('extra_param_to_key_url'):
self.report_error('pycrypto not found. Please install it.')
return False
self.report_warning(
Expand Down Expand Up @@ -115,10 +115,19 @@ def is_ad_fragment_end(s):
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
test = self.params.get('test', False)

extra_query = None
extra_segment_query = None
extra_key_query = None
extra_key_url = None
extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
extra_param_to_key_url = info_dict.get('extra_param_to_key_url')
if extra_param_to_segment_url:
extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
extra_segment_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
extra_key_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
if extra_param_to_key_url:
if extra_param_to_key_url.startswith('http'):
extra_key_url = extra_param_to_key_url
else:
extra_key_query = compat_urlparse.parse_qs(extra_param_to_key_url)
i = 0
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
Expand All @@ -138,8 +147,8 @@ def is_ad_fragment_end(s):
line
if re.match(r'^https?://', line)
else compat_urlparse.urljoin(man_url, line))
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
if extra_segment_query:
frag_url = update_url_query(frag_url, extra_segment_query)
count = 0
headers = info_dict.get('http_headers', {})
if byte_range:
Expand Down Expand Up @@ -193,8 +202,10 @@ def is_ad_fragment_end(s):
if not re.match(r'^https?://', decrypt_info['URI']):
decrypt_info['URI'] = compat_urlparse.urljoin(
man_url, decrypt_info['URI'])
if extra_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
if extra_key_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_key_query)
elif extra_key_url:
decrypt_info['URI'] = extra_key_url
if decrypt_url != decrypt_info['URI']:
decrypt_info['KEY'] = None
elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
Expand Down
1 change: 1 addition & 0 deletions youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,7 @@
)
from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE
from .sproutvideo import SproutVideoIE
from .srgssr import (
SRGSSRIE,
SRGSSRPlayIE,
Expand Down
18 changes: 18 additions & 0 deletions youtube_dl/extractor/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE
from .arcpublishing import ArcPublishingIE
from .sproutvideo import SproutVideoIE


class GenericIE(InfoExtractor):
Expand Down Expand Up @@ -2181,6 +2182,18 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
{
# SproutVideo iframe in page
'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
'info_dict': {
'id': '4c9dddb01910e3c9c4',
'ext': 'mp4',
'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
},
'params': {
'skip_download': True,
},
},
# {
# # TODO: find another test
# # http://schema.org/VideoObject
Expand Down Expand Up @@ -3298,6 +3311,11 @@ def _real_extract(self, url):
return self.playlist_from_matches(
zype_urls, video_id, video_title, ie=ZypeIE.ie_key())

sproutvideo_urls = SproutVideoIE._extract_urls(webpage)
if sproutvideo_urls:
return self.playlist_from_matches(
sproutvideo_urls, video_id, video_title, ie=SproutVideoIE.ie_key())

# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
Expand Down
90 changes: 90 additions & 0 deletions youtube_dl/extractor/sproutvideo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor

from ..compat import (
compat_b64decode,
compat_urllib_parse_urlencode,
)
from ..utils import std_headers


class SproutVideoIE(InfoExtractor):
    """Extractor for SproutVideo embed pages (videos.sproutvideo.com/embed/...).

    The embed page carries a base64-encoded JSON blob (``var dat = '...'``)
    holding the title, the storage location of the HLS manifest and three
    separate signature sets used to authorise manifest, key and segment
    requests.
    """

    # Scheme-less form of the embed URL, shared by _VALID_URL and by the
    # iframe detection in _extract_urls.
    _NOSCHEMA_URL = r'//videos\.sproutvideo\.com/embed/(?P<id>[a-f0-9]+)/[a-f0-9]+'
    _VALID_URL = r'https?:%s' % _NOSCHEMA_URL
    _TEST = {
        'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
        'md5': 'fbc675bb97437e797d11d14d99563f50',
        'info_dict': {
            'id': '4c9dddb01910e3c9c4',
            'ext': 'mp4',
            'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
        }
    }

    @staticmethod
    def _extract_urls(webpage):
        """Return the SproutVideo embed URLs found in iframe ``src`` attributes.

        Both absolute (``https://...``) and scheme-less (``//...``) sources
        are accepted and returned exactly as written in the page; the scheme
        is NOT added here.
        NOTE(review): presumably the caller resolves protocol-relative URLs
        before handing them back to this extractor - confirm.
        """
        # Trailing ``[^'"]*`` (not ``+``): anything after the second hash,
        # e.g. a query string, is optional.
        return [mobj.group('url') for mobj in re.finditer(
            r'<iframe[^>]+src=[\'"](?P<url>(?:https?:|)%s[^\'"]*)[\'"]' % SproutVideoIE._NOSCHEMA_URL,
            webpage)]

    def _real_extract(self, url):
        """Download the embed page and build the info dict for its HLS formats."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id, headers=std_headers)

        # The player configuration is a base64-encoded JSON string.
        data = self._search_regex(r"var\s+dat\s+=\s+'([^']+)';", webpage, 'data')
        data_decoded = compat_b64decode(data).decode('utf-8')
        parsed_data = self._parse_json(data_decoded, video_id)

        # https://github.com/ytdl-org/youtube-dl/issues/16996#issuecomment-406901324
        # signature->m for manifests
        # signature->k for keys
        # signature->t for segments
        m_sign = self._policy_to_qs(parsed_data, 'm')
        k_sign = self._policy_to_qs(parsed_data, 'k')
        t_sign = self._policy_to_qs(parsed_data, 't')

        custom_headers = {
            'Accept': '*/*',
            'Origin': 'https://videos.sproutvideo.com',
            'Referer': url
        }

        resource_url = 'https://{0}.videos.sproutvideo.com/{1}/{2}/video/index.m3u8?{3}'.format(
            parsed_data['base'], parsed_data['s3_user_hash'], parsed_data['s3_video_hash'], m_sign)

        formats = self._extract_m3u8_formats(
            resource_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False,
            headers=custom_headers)
        self._sort_formats(formats)

        for entry in formats:
            # Each variant playlist needs the manifest signature as well.
            # Append with '&' when the variant URL already carries a query
            # string, so the result never contains two '?' separators.
            separator = '&' if '?' in entry['url'] else '?'
            entry.update({
                'url': '{0}{1}{2}'.format(entry['url'], separator, m_sign),
                # Read by the HLS downloader when fetching segments and keys.
                'extra_param_to_segment_url': t_sign,
                'extra_param_to_key_url': k_sign,
                'http_headers': custom_headers
            })

        return {
            'id': video_id,
            'title': parsed_data['title'],
            'formats': formats,
        }

    @staticmethod
    def _format_qsdata(qs_data):
        """Return a copy of ``qs_data`` with ``CloudFront-`` removed from every key."""
        # dict() over a generator rather than a dict comprehension, which
        # keeps the module importable on interpreters without that syntax.
        return dict(
            (key.replace('CloudFront-', ''), value)
            for key, value in qs_data.items())

    @staticmethod
    def _policy_to_qs(policy, key):
        """Build the URL-encoded query string for one signature set.

        ``policy`` is the decoded player configuration and ``key`` selects the
        entry of ``policy['signatures']`` to use. The session ID from
        ``policy`` is added to the signature parameters before encoding.
        """
        sign = SproutVideoIE._format_qsdata(policy['signatures'][key])
        sign['sessionID'] = policy['sessionID']
        return compat_urllib_parse_urlencode(sign, doseq=True)
2 changes: 1 addition & 1 deletion youtube_dl/extractor/vzaar.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _real_extract(self, url):
m3u8_id='hls', fatal=False)
if hls_aes:
for f in m3u8_formats:
f['_decryption_key_url'] = url_templ % ('goose', '') + qs
f['extra_param_to_key_url'] = url_templ % ('goose', '') + qs
formats.extend(m3u8_formats)

self._sort_formats(formats)
Expand Down