From c91af948e43570025e4aa887e248fd025abae394 Mon Sep 17 00:00:00 2001 From: Tristan Charpentier Date: Sun, 17 Dec 2023 09:07:55 -0500 Subject: [PATCH 1/5] [ie/RinseFM] Add extractor (#8778) Authored by: hashFactory --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rinsefm.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 yt_dlp/extractor/rinsefm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9b96bd5b4554..94369ca66f47 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1590,6 +1590,7 @@ from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rheinmaintv import RheinMainTVIE +from .rinsefm import RinseFMIE from .rmcdecouverte import RMCDecouverteIE from .rockstargames import RockstarGamesIE from .rokfin import ( diff --git a/yt_dlp/extractor/rinsefm.py b/yt_dlp/extractor/rinsefm.py new file mode 100644 index 000000000000..760adf0ebae3 --- /dev/null +++ b/yt_dlp/extractor/rinsefm.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor +from ..utils import format_field, parse_iso8601 + + +class RinseFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/', + 'md5': '76ee0b719315617df42e15e710f46c7b', + 'info_dict': { + 'id': '1536535', + 'ext': 'mp3', + 'title': 'Club Glow - 15/12/2023 - 20:00', + 'thumbnail': r're:^https://.+\.(?:jpg|JPG)$', + 'release_timestamp': 1702598400, + 'release_date': '20231215' + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry'] + + return { + 'id': entry['id'], + 'title': entry.get('title'), + 'url': entry['fileUrl'], + 'vcodec': 'none', + 'release_timestamp': parse_iso8601(entry.get('episodeDate')), + 'thumbnail': 
format_field( + entry, [('featuredImage', 0, 'filename')], 'https://rinse.imgix.net/media/%s', default=None), + } From c5f01bf7d4b9426c87c3f8248de23934a56579e0 Mon Sep 17 00:00:00 2001 From: "Amir Y. Perehodnik" Date: Mon, 18 Dec 2023 17:52:43 +0200 Subject: [PATCH 2/5] [ie/Maariv] Add extractor (#8331) Authored by: amir16yp --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/maariv.py | 62 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 yt_dlp/extractor/maariv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 94369ca66f47..b3c41139407d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -991,6 +991,7 @@ LyndaIE, LyndaCourseIE ) +from .maariv import MaarivIE from .magellantv import MagellanTVIE from .magentamusik360 import MagentaMusik360IE from .mailru import ( diff --git a/yt_dlp/extractor/maariv.py b/yt_dlp/extractor/maariv.py new file mode 100644 index 000000000000..425a8b3b4a6d --- /dev/null +++ b/yt_dlp/extractor/maariv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_resolution, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class MaarivIE(InfoExtractor): + IE_NAME = 'maariv.co.il' + _VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P<id>\d+)' + _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.maariv.co.il/news/law/Article-1044008', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + + def 
_real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)['data'] + + formats = [] + if hls_url := traverse_obj(data, ('video', 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False)) + + for http_format in traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none})): + formats.append({ + 'url': http_format, + 'format_id': 'http', + **parse_resolution(http_format), + }) + + return { + 'id': video_id, + **traverse_obj(data, { + 'title': 'title', + 'duration': ('video', 'duration', {int_or_none}), + 'timestamp': ('upload_date', {unified_timestamp}), + }), + 'formats': formats, + } From 00a3e47bf5440c96025a76e08337ff2a475ed83e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 18 Dec 2023 21:32:08 +0100 Subject: [PATCH 3/5] [ie/bundestag] Add extractor (#8783) Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bundestag.py | 123 ++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 yt_dlp/extractor/bundestag.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b3c41139407d..572d79fba253 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -276,6 +276,7 @@ ) from .businessinsider import BusinessInsiderIE from .bundesliga import BundesligaIE +from .bundestag import BundestagIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py new file mode 100644 index 000000000000..9fd7c7de185b --- /dev/null +++ b/yt_dlp/extractor/bundestag.py @@ -0,0 +1,123 @@ +import re +from functools import partial + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + bug_reports_message, + 
clean_html, + format_field, + get_element_text_and_html_by_tag, + int_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class BundestagIE(InfoExtractor): + _VALID_URL = [ + r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)', + r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)', + ] + _TESTS = [{ + 'url': 'https://dbtg.tv/cvid/7605304', + 'info_dict': { + 'id': '7605304', + 'ext': 'mp4', + 'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit', + 'description': 'md5:321a9dc6bdad201264c0045efc371561', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek', + 'info_dict': { + 'id': '7602120', + 'ext': 'mp4', + 'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung', + 'description': 'Befragung der Bundesregierung', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek', + 'only_matching': True, + }, { + 'url': 'http://dbtg.tv/fvid/3594346', + 'only_matching': True, + }] + + _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay' + _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8' + + _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId=' + _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)' + _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)' + + def _bt_extract_share_formats(self, video_id): + share_data = self._download_json( + f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON') + if traverse_obj(share_data, ('status', 'code', {int})) != 1: + self.report_warning(format_field( + share_data, [('status', 'message', {str})], + 'Share API response: %s', default='Unknown Share API Error') + + 
bug_reports_message()) + return + + for name, url in share_data.items(): + if not isinstance(name, str) or not url_or_none(url): + continue + + elif name.startswith('audio'): + match = re.search(self._SHARE_AUDIO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + 'vcodec': 'none', + **traverse_obj(match, { + 'acodec': 'codec', + 'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}), + 'abr': ('bitrate', {int_or_none}), + 'ext': 'ext', + }), + } + + elif name.startswith('download'): + match = re.search(self._SHARE_VIDEO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + **traverse_obj(match, { + 'vcodec': 'codec', + 'tbr': ('bitrate', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'ext': 'ext', + }), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [] + result = {'id': video_id, 'formats': formats} + + try: + formats.extend(self._extract_m3u8_formats( + self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + raise ExtractorError('Could not find video id', expected=True) + self.report_warning(f'Error extracting hls formats: {error}', video_id) + formats.extend(self._bt_extract_share_formats(video_id)) + if not formats: + self.raise_no_formats('Could not find suitable formats', video_id=video_id) + + result.update(traverse_obj(self._download_webpage( + self._OVERLAY_URL, video_id, + query={'videoid': video_id, 'view': 'main'}, + note='Downloading metadata overlay', fatal=False, + ), { + 'title': ( + {partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {partial(re.sub, r']*>[^<]+', '')}, {clean_html}), + 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + })) + + return result From 1c54a98e19d047e7c15184237b6ef8ad50af489c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> 
Date: Tue, 19 Dec 2023 07:24:55 -0600 Subject: [PATCH 4/5] [ie/twitter] Extract stale tweets (#8724) Closes #8691 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 85 ++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index d7609bc8132e..932b478d44bf 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -479,9 +479,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 18, + '_old_archive_ids': ['twitter 643211948184596480'], }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -515,6 +515,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'age_limit': 0, + '_old_archive_ids': ['twitter 665052190608723968'], }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', @@ -558,9 +559,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, + '_old_archive_ids': ['twitter 700207533655363584'], }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -599,9 +600,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 719944021058060289'], }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', @@ -616,6 +617,7 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'add_ie': ['Periscope'], + 'skip': 'Broadcast not found', }, { # has mp4 formats via mobile API 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', @@ -635,9 +637,9 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'tags': [], 'repost_count': int, - 'view_count': int, 
'like_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 852138619213144067'], }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -657,9 +659,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, + '_old_archive_ids': ['twitter 910031516746514432'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -683,9 +685,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1001551623938805763'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -749,6 +751,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1349794411333394432'], }, 'params': { 'skip_download': True, @@ -771,18 +774,18 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1577855540407197696'], }, 'params': {'skip_download': True}, }, { 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛| New Era - Test', + 'title': 'Ultima - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛| New Era', + 'uploader': 'Ultima', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -813,9 +816,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, + '_old_archive_ids': ['twitter 1575560063510810624'], }, }, { # Adult content, fails if not logged in @@ -951,10 +954,10 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 
'like_count': int, - 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, + '_old_archive_ids': ['twitter 1600649710662213632'], }, 'params': {'noplaylist': True}, }, { @@ -979,7 +982,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'view_count': int, + '_old_archive_ids': ['twitter 1621117700482416640'], }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -995,13 +998,13 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'duration': 9.531, 'comment_count': int, - 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, 'tags': [], 'uploader': '\u06ea', 'description': '\U0001F48B https://t.co/bTj9Qz7vQP', + '_old_archive_ids': ['twitter 1599108751385972737'], }, 'params': {'noplaylist': True}, }, { @@ -1012,7 +1015,6 @@ class TwitterIE(TwitterBaseIE): 'ext': 'mp4', 'uploader_url': 'https://twitter.com/MunTheShinobi', 'description': 'This is a genius ad by Apple. 
\U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', - 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, 'uploader': 'Mün', @@ -1025,6 +1027,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MunTheShinobi', 'duration': 139.987, 'timestamp': 1670306984.0, + '_old_archive_ids': ['twitter 1600009574919962625'], }, }, { # retweeted_status (private) @@ -1068,8 +1071,8 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, - 'view_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, }, { # retweeted_status w/ legacy API @@ -1091,18 +1094,24 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, }, { # Broadcast embedded in tweet - 'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', + 'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384', 'info_dict': { - 'id': '1yNGaNLjEblJj', + 'id': '1rmxPMjLzAXKN', 'ext': 'mp4', - 'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', + 'title': 'WAVE Weather Now - Saturday 12/2/23 Update', 'uploader': 'Jessica Dobson', - 'uploader_id': '1DZEoDwDovRQa', - 'thumbnail': r're:^https?://.*\.jpg', + 'uploader_id': 'JessicaDobsonWX', + 'uploader_url': 'https://twitter.com/JessicaDobsonWX', + 'timestamp': 1701566398, + 'upload_date': '20231203', + 'live_status': 'was_live', + 'thumbnail': r're:https://[^/]+pscp\.tv/.+\.jpg', + 'concurrent_view_count': int, 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], @@ -1125,6 +1134,30 @@ class TwitterIE(TwitterBaseIE): }, 'params': {'extractor_args': {'twitter': {'api': 
['syndication']}}}, 'expected_warnings': ['Not all metadata'], + }, { + # "stale tweet" with typename "TweetWithVisibilityResults" + 'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154', + 'md5': '62b1e11cdc2cdd0e527f83adb081f536', + 'info_dict': { + 'id': '1724883339285544960', + 'ext': 'mp4', + 'title': 'md5:cc56716f9ed0b368de2ba54c478e493c', + 'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164', + 'display_id': '1724884212803834154', + 'uploader': 'Robert F. Kennedy Jr', + 'uploader_id': 'RobertKennedyJr', + 'uploader_url': 'https://twitter.com/RobertKennedyJr', + 'upload_date': '20231115', + 'timestamp': 1700079417.0, + 'duration': 341.048, + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'tags': ['Kennedy24'], + 'repost_count': int, + 'like_count': int, + 'comment_count': int, + 'age_limit': 0, + '_old_archive_ids': ['twitter 1724884212803834154'], + }, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1179,19 +1212,23 @@ def _graphql_to_legacy(self, data, twid): ), default={}, get_all=False) if self.is_logged_in else traverse_obj( data, ('tweetResult', 'result', {dict}), default={}) - if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): - self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) + typename = result.get('__typename') + if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): + self.report_warning(f'Unknown typename: {typename}', twid, only_once=True) if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. 
Learn more') raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) - elif result.get('__typename') == 'TweetUnavailable': + elif typename == 'TweetUnavailable': reason = result.get('reason') if reason == 'NsfwLoggedOut': self.raise_login_required('NSFW tweet requires authentication') elif reason == 'Protected': self.raise_login_required('You are not authorized to view this protected tweet') raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) + # Result for "stale tweet" needs additional transformation + elif typename == 'TweetWithVisibilityResults': + result = traverse_obj(result, ('tweet', {dict})) or {} status = result.get('legacy', {}) status.update(traverse_obj(result, { @@ -1377,7 +1414,7 @@ def add_thumbnail(name, size): 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), + 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), From db8b4edc7d0bd27da462f6fe82ff6e13e3d68a04 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:21:47 +0800 Subject: [PATCH 5/5] [ie/JoqrAg] Add extractor (#8384) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/joqrag.py | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 yt_dlp/extractor/joqrag.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 572d79fba253..d5f030c6b078 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -865,6 +865,7 @@ ) from .jove import JoveIE from .joj import JojIE +from .joqrag import JoqrAgIE from .jstream import JStreamIE from 
.jtbc import ( JTBCIE, diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py new file mode 100644 index 000000000000..3bb28af94e12 --- /dev/null +++ b/yt_dlp/extractor/joqrag.py @@ -0,0 +1,112 @@ +import datetime +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + clean_html, + datetime_from_str, + unified_timestamp, + urljoin, +) + + +class JoqrAgIE(InfoExtractor): + IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)' + _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php', + r'https?://(?:www\.)?joqr\.co\.jp/ag/', + r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])'] + _TESTS = [{ + 'url': 'https://www.uniqueradio.jp/agplayer5/player.php', + 'info_dict': { + 'id': 'live', + 'title': str, + 'channel': '超!A&G+', + 'description': str, + 'live_status': 'is_live', + 'release_timestamp': int, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', + 'only_matching': True, + }, { + 'url': 'https://www.joqr.co.jp/ag/article/103760/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agdailyprogram/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agregularprogram/', + 'only_matching': True, + }] + + def _extract_metadata(self, variable, html): + return clean_html(urllib.parse.unquote_plus(self._search_regex( + rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + html, 'metadata', group='value', default=''))) or None + + def _extract_start_timestamp(self, video_id, is_live): + def extract_start_time_from(date_str): + dt = datetime_from_str(date_str) + datetime.timedelta(hours=9) + date = dt.strftime('%Y%m%d') + start_time = self._search_regex( + r']+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})', + self._download_webpage( + 
f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id, + note=f'Downloading program list of {date}', fatal=False, + errnote=f'Failed to download program list of {date}') or '', + 'start time', default=None) + if start_time: + return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00') + return None + + start_timestamp = extract_start_time_from('today') + if not start_timestamp: + return None + + if not is_live or start_timestamp < datetime_from_str('now').timestamp(): + return start_timestamp + else: + return extract_start_time_from('yesterday') + + def _real_extract(self, url): + video_id = 'live' + + metadata = self._download_webpage( + 'https://www.uniqueradio.jp/aandg', video_id, + note='Downloading metadata', errnote='Failed to download metadata') + title = self._extract_metadata('Program_name', metadata) + + if title == '放送休止': + formats = [] + live_status = 'is_upcoming' + release_timestamp = self._extract_start_timestamp(video_id, False) + msg = 'This stream is not currently live' + if release_timestamp: + msg += (' and will start at ' + + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) + self.raise_no_formats(msg, expected=True) + else: + m3u8_path = self._search_regex( + r']*\bsrc="([^"]+)"', + self._download_webpage( + 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id, + note='Downloading player data', errnote='Failed to download player data'), + 'm3u8 url') + formats = self._extract_m3u8_formats( + urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id) + live_status = 'is_live' + release_timestamp = self._extract_start_timestamp(video_id, True) + + return { + 'id': video_id, + 'title': title, + 'channel': '超!A&G+', + 'description': self._extract_metadata('Program_text', metadata), + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + }