Skip to content

Commit

Permalink
Merge branch 'yt-dlp:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
saintliao authored Dec 19, 2023
2 parents de1d3d2 + db8b4ed commit 8b07815
Show file tree
Hide file tree
Showing 6 changed files with 395 additions and 24 deletions.
4 changes: 4 additions & 0 deletions yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@
)
from .businessinsider import BusinessInsiderIE
from .bundesliga import BundesligaIE
from .bundestag import BundestagIE
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
Expand Down Expand Up @@ -864,6 +865,7 @@
)
from .jove import JoveIE
from .joj import JojIE
from .joqrag import JoqrAgIE
from .jstream import JStreamIE
from .jtbc import (
JTBCIE,
Expand Down Expand Up @@ -991,6 +993,7 @@
LyndaIE,
LyndaCourseIE
)
from .maariv import MaarivIE
from .magellantv import MagellanTVIE
from .magentamusik360 import MagentaMusik360IE
from .mailru import (
Expand Down Expand Up @@ -1590,6 +1593,7 @@
from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
from .rheinmaintv import RheinMainTVIE
from .rinsefm import RinseFMIE
from .rmcdecouverte import RMCDecouverteIE
from .rockstargames import RockstarGamesIE
from .rokfin import (
Expand Down
123 changes: 123 additions & 0 deletions yt_dlp/extractor/bundestag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import re
from functools import partial

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
bug_reports_message,
clean_html,
format_field,
get_element_text_and_html_by_tag,
int_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj


class BundestagIE(InfoExtractor):
_VALID_URL = [
r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)',
r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)',
]
_TESTS = [{
'url': 'https://dbtg.tv/cvid/7605304',
'info_dict': {
'id': '7605304',
'ext': 'mp4',
'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit',
'description': 'md5:321a9dc6bdad201264c0045efc371561',
},
}, {
'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek',
'info_dict': {
'id': '7602120',
'ext': 'mp4',
'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung',
'description': 'Befragung der Bundesregierung',
},
}, {
'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek',
'only_matching': True,
}, {
'url': 'http://dbtg.tv/fvid/3594346',
'only_matching': True,
}]

_OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay'
_INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8'

_SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId='
_SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)'
_SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)'

def _bt_extract_share_formats(self, video_id):
share_data = self._download_json(
f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON')
if traverse_obj(share_data, ('status', 'code', {int})) != 1:
self.report_warning(format_field(
share_data, [('status', 'message', {str})],
'Share API response: %s', default='Unknown Share API Error')
+ bug_reports_message())
return

for name, url in share_data.items():
if not isinstance(name, str) or not url_or_none(url):
continue

elif name.startswith('audio'):
match = re.search(self._SHARE_AUDIO_REGEX, url)
yield {
'format_id': name,
'url': url,
'vcodec': 'none',
**traverse_obj(match, {
'acodec': 'codec',
'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}),
'abr': ('bitrate', {int_or_none}),
'ext': 'ext',
}),
}

elif name.startswith('download'):
match = re.search(self._SHARE_VIDEO_REGEX, url)
yield {
'format_id': name,
'url': url,
**traverse_obj(match, {
'vcodec': 'codec',
'tbr': ('bitrate', {int_or_none}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'ext': 'ext',
}),
}

def _real_extract(self, url):
video_id = self._match_id(url)
formats = []
result = {'id': video_id, 'formats': formats}

try:
formats.extend(self._extract_m3u8_formats(
self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance'))
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 404:
raise ExtractorError('Could not find video id', expected=True)
self.report_warning(f'Error extracting hls formats: {error}', video_id)
formats.extend(self._bt_extract_share_formats(video_id))
if not formats:
self.raise_no_formats('Could not find suitable formats', video_id=video_id)

result.update(traverse_obj(self._download_webpage(
self._OVERLAY_URL, video_id,
query={'videoid': video_id, 'view': 'main'},
note='Downloading metadata overlay', fatal=False,
), {
'title': (
{partial(get_element_text_and_html_by_tag, 'h3')}, 0,
{partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
}))

return result
112 changes: 112 additions & 0 deletions yt_dlp/extractor/joqrag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import datetime
import urllib.parse

from .common import InfoExtractor
from ..utils import (
clean_html,
datetime_from_str,
unified_timestamp,
urljoin,
)


class JoqrAgIE(InfoExtractor):
IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)'
_VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php',
r'https?://(?:www\.)?joqr\.co\.jp/ag/',
r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])']
_TESTS = [{
'url': 'https://www.uniqueradio.jp/agplayer5/player.php',
'info_dict': {
'id': 'live',
'title': str,
'channel': '超!A&G+',
'description': str,
'live_status': 'is_live',
'release_timestamp': int,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php',
'only_matching': True,
}, {
'url': 'https://www.joqr.co.jp/ag/article/103760/',
'only_matching': True,
}, {
'url': 'http://www.joqr.co.jp/qr/agdailyprogram/',
'only_matching': True,
}, {
'url': 'http://www.joqr.co.jp/qr/agregularprogram/',
'only_matching': True,
}]

def _extract_metadata(self, variable, html):
return clean_html(urllib.parse.unquote_plus(self._search_regex(
rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
html, 'metadata', group='value', default=''))) or None

def _extract_start_timestamp(self, video_id, is_live):
def extract_start_time_from(date_str):
dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
date = dt.strftime('%Y%m%d')
start_time = self._search_regex(
r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})',
self._download_webpage(
f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id,
note=f'Downloading program list of {date}', fatal=False,
errnote=f'Failed to download program list of {date}') or '',
'start time', default=None)
if start_time:
return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
return None

start_timestamp = extract_start_time_from('today')
if not start_timestamp:
return None

if not is_live or start_timestamp < datetime_from_str('now').timestamp():
return start_timestamp
else:
return extract_start_time_from('yesterday')

def _real_extract(self, url):
video_id = 'live'

metadata = self._download_webpage(
'https://www.uniqueradio.jp/aandg', video_id,
note='Downloading metadata', errnote='Failed to download metadata')
title = self._extract_metadata('Program_name', metadata)

if title == '放送休止':
formats = []
live_status = 'is_upcoming'
release_timestamp = self._extract_start_timestamp(video_id, False)
msg = 'This stream is not currently live'
if release_timestamp:
msg += (' and will start at '
+ datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
self.raise_no_formats(msg, expected=True)
else:
m3u8_path = self._search_regex(
r'<source\s[^>]*\bsrc="([^"]+)"',
self._download_webpage(
'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id,
note='Downloading player data', errnote='Failed to download player data'),
'm3u8 url')
formats = self._extract_m3u8_formats(
urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id)
live_status = 'is_live'
release_timestamp = self._extract_start_timestamp(video_id, True)

return {
'id': video_id,
'title': title,
'channel': '超!A&G+',
'description': self._extract_metadata('Program_text', metadata),
'formats': formats,
'live_status': live_status,
'release_timestamp': release_timestamp,
}
62 changes: 62 additions & 0 deletions yt_dlp/extractor/maariv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_resolution,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import traverse_obj


class MaarivIE(InfoExtractor):
IE_NAME = 'maariv.co.il'
_VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P<id>\d+)'
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585',
'info_dict': {
'id': '3611585',
'duration': 75,
'ext': 'mp4',
'upload_date': '20231009',
'title': 'מבצע חרבות ברזל',
'timestamp': 1696851301,
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.maariv.co.il/news/law/Article-1044008',
'info_dict': {
'id': '3611585',
'duration': 75,
'ext': 'mp4',
'upload_date': '20231009',
'title': 'מבצע חרבות ברזל',
'timestamp': 1696851301,
},
}]

def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)['data']

formats = []
if hls_url := traverse_obj(data, ('video', 'url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False))

for http_format in traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none})):
formats.append({
'url': http_format,
'format_id': 'http',
**parse_resolution(http_format),
})

return {
'id': video_id,
**traverse_obj(data, {
'title': 'title',
'duration': ('video', 'duration', {int_or_none}),
'timestamp': ('upload_date', {unified_timestamp}),
}),
'formats': formats,
}
33 changes: 33 additions & 0 deletions yt_dlp/extractor/rinsefm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from .common import InfoExtractor
from ..utils import format_field, parse_iso8601


class RinseFMIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/',
'md5': '76ee0b719315617df42e15e710f46c7b',
'info_dict': {
'id': '1536535',
'ext': 'mp3',
'title': 'Club Glow - 15/12/2023 - 20:00',
'thumbnail': r're:^https://.+\.(?:jpg|JPG)$',
'release_timestamp': 1702598400,
'release_date': '20231215'
}
}]

def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry']

return {
'id': entry['id'],
'title': entry.get('title'),
'url': entry['fileUrl'],
'vcodec': 'none',
'release_timestamp': parse_iso8601(entry.get('episodeDate')),
'thumbnail': format_field(
entry, [('featuredImage', 0, 'filename')], 'https://rinse.imgix.net/media/%s', default=None),
}
Loading

0 comments on commit 8b07815

Please sign in to comment.