From 91a670a4f7babe9c8aa2018f57d8c8952a6f49d8 Mon Sep 17 00:00:00 2001 From: gillux Date: Sat, 7 Oct 2023 06:27:54 +0800 Subject: [PATCH 001/108] [ie/LiTV] Fix extractor (#7785) Closes #5456 Authored by: jiru --- yt_dlp/extractor/litv.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 19b298ec6c09..2c7c7175eafe 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -13,7 +13,7 @@ class LiTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s' _TESTS = [{ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', @@ -21,16 +21,18 @@ class LiTVIE(InfoExtractor): 'id': 'VOD00041606', 'title': '花千骨', }, - 'playlist_count': 50, + 'playlist_count': 51, # 50 episodes + 1 trailer }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'md5': '969e343d9244778cb29acec608e53640', + 'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', 'title': '花千骨第1集', 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。', + 'categories': ['奇幻', '愛情', '中國', '仙俠'], + 'episode': 'Episode 1', 'episode_number': 1, }, 'params': { @@ -46,20 +48,17 @@ class LiTVIE(InfoExtractor): 'title': '芈月傳第1集 霸星芈月降世楚國', 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, - 'skip': 'Georestricted to Taiwan', + 'skip': 'No longer exists', }] - def _extract_playlist(self, season_list, video_id, program_info, prompt=True): - episode_title = program_info['title'] - content_id = season_list['contentId'] - + def _extract_playlist(self, playlist_data, content_type): all_episodes = [ self.url_result(smuggle_url( - self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), + self._URL_TEMPLATE % (content_type, episode['contentId']), {'force_noplaylist': True})) # To prevent infinite recursion - for episode in season_list['episode']] + for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))] - return self.playlist_result(all_episodes, content_id, episode_title) + return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title')) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -68,24 +67,31 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) + if self._search_regex( + r'(?i)]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"', + webpage, 'meta refresh redirect', default=False, group=0): + raise ExtractorError('No such content found', expected=True) + program_info = self._parse_json(self._search_regex( r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), video_id) - season_list = list(program_info.get('seasonList', {}).values()) - playlist_id = traverse_obj(season_list, 0, 'contentId') - if self._yes_playlist(playlist_id, video_id, smuggled_data): - return self._extract_playlist(season_list[0], video_id, program_info) - - # In browsers `getMainUrl` request is always issued. Usually this + # In browsers `getProgramInfo` request is always issued. Usually this # endpoint gives the same result as the data embedded in the webpage. - # If georestricted, there are no embedded data, so an extra request is - # necessary to get the error code + # If, for some reason, there are no embedded data, we do an extra request. if 'assetId' not in program_info: program_info = self._download_json( 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, query={'contentId': video_id}, headers={'Accept': 'application/json'}) + + series_id = program_info['seriesId'] + if self._yes_playlist(series_id, video_id, smuggled_data): + playlist_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getSeriesTree', video_id, + query={'seriesId': series_id}, headers={'Accept': 'application/json'}) + return self._extract_playlist(playlist_data, program_info['contentType']) + video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) @@ -96,7 +102,7 @@ def _real_extract(self, url): 'contentType': program_info['contentType'], } video_data = self._download_json( - 'https://www.litv.tv/vod/getMainUrl', video_id, + 'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id, data=json.dumps(payload).encode('utf-8'), headers={'Content-Type': 'application/json'}) From f980df734cf5c0eaded2f7b38c6c60bccfeebb48 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 6 Oct 2023 18:31:33 -0400 Subject: [PATCH 002/108] [ie/neteasemusic] Fix extractors (#8181) Closes #4388 Authored by: c-basalt --- yt_dlp/extractor/neteasemusic.py | 585 +++++++++++++++++-------------- 1 file changed, 317 insertions(+), 268 deletions(-) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 5b7307bc8f66..68bfcb6ba772 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -2,105 +2,74 @@ import json import re import time -from base64 import b64encode -from binascii import hexlify -from datetime import datetime from hashlib import md5 from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding -from ..compat import compat_urllib_parse_urlencode -from ..networking import Request from ..utils import ( ExtractorError, - bytes_to_intlist, - error_to_compat_str, - float_or_none, int_or_none, - intlist_to_bytes, - try_get, + join_nonempty, + str_or_none, + strftime_or_none, + traverse_obj, + unified_strdate, + url_or_none, + urljoin, + variadic, ) class NetEaseMusicBaseIE(InfoExtractor): _FORMATS = ['bMusic', 'mMusic', 'hMusic'] - _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' _API_BASE = 'http://music.163.com/api/' + _GEO_BYPASS = False - @classmethod - def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(str(dfsid).encode('ascii')) - salt_len = len(salt_bytes) - for i in range(len(string_bytes)): - string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] - m = md5() - m.update(bytes(string_bytes)) - result = b64encode(m.digest()).decode('ascii') - return result.replace('/', '_').replace('+', '-') - - def make_player_api_request_data_and_headers(self, song_id, bitrate): - KEY = b'e82ckenh8dichen8' - URL = '/api/song/enhance/player/url' - now = int(time.time() * 1000) - rand = randint(0, 1000) - cookie = { - 'osver': None, - 'deviceId': None, + @staticmethod + def kilo_or_none(value): + return int_or_none(value, scale=1000) + + def _create_eapi_cipher(self, api_path, query_body, cookies): + request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) + + message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1') + msg_digest = md5(message).hexdigest() + + data = pkcs7_padding(list(str.encode( + f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}'))) + encrypted = bytes(aes_ecb_encrypt(data, list(b'e82ckenh8dichen8'))) + return f'params={encrypted.hex().upper()}'.encode() + + def _download_eapi_json(self, path, video_id, query_body, headers={}, **kwargs): + cookies = { + 'osver': 'undefined', + 'deviceId': 'undefined', 'appver': '8.0.0', 'versioncode': '140', - 'mobilename': None, + 'mobilename': 'undefined', 'buildver': '1623435496', 'resolution': '1920x1080', '__csrf': '', 'os': 'pc', - 'channel': None, - 'requestId': '{0}_{1:04}'.format(now, rand), - } - request_text = json.dumps( - {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, - separators=(',', ':')) - message = 'nobody{0}use{1}md5forencrypt'.format( - URL, request_text).encode('latin1') - msg_digest = md5(message).hexdigest() - - data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( - URL, request_text, msg_digest) - data = pkcs7_padding(bytes_to_intlist(data)) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) - encrypted_params = hexlify(encrypted).decode('ascii').upper() - - cookie = '; '.join( - ['{0}={1}'.format(k, v if v is not None else 'undefined') - for [k, v] in cookie.items()]) - - headers = { - 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'], - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': 'https://music.163.com', - 'Cookie': cookie, + 'channel': 'undefined', + 'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}', + **traverse_obj(self._get_cookies(self._API_BASE), { + 'MUSIC_U': ('MUSIC_U', {lambda i: i.value}), + }) } - return ('params={0}'.format(encrypted_params), headers) + return self._download_json( + urljoin('https://interface3.music.163.com/', f'/eapi{path}'), video_id, + data=self._create_eapi_cipher(f'/api{path}', query_body, cookies), headers={ + 'Referer': 'https://music.163.com', + 'Cookie': '; '.join([f'{k}={v}' for k, v in cookies.items()]), + **headers, + }, **kwargs) def _call_player_api(self, song_id, bitrate): - url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' - data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) - try: - msg = 'empty result' - result = self._download_json( - url, song_id, data=data.encode('ascii'), headers=headers) - if result: - return result - except ExtractorError as e: - if type(e.cause) in (ValueError, TypeError): - # JSON load failure - raise - except Exception as e: - msg = error_to_compat_str(e) - self.report_warning('%s API call (%s) failed: %s' % ( - song_id, bitrate, msg)) - return {} + return self._download_eapi_json( + '/song/enhance/player/url', song_id, {'ids': f'[{song_id}]', 'br': bitrate}, + note=f'Downloading song URL info: bitrate {bitrate}') def extract_formats(self, info): err = 0 @@ -110,45 +79,50 @@ def extract_formats(self, info): details = info.get(song_format) if not details: continue - bitrate = int_or_none(details.get('bitrate')) or 999000 - data = self._call_player_api(song_id, bitrate) - for song in try_get(data, lambda x: x['data'], list) or []: - song_url = try_get(song, lambda x: x['url']) - if not song_url: - continue + for song in traverse_obj(self._call_player_api(song_id, bitrate), ('data', lambda _, v: url_or_none(v['url']))): + song_url = song['url'] if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, - 'ext': details.get('extension'), - 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': int_or_none(song.get('size')), - 'asr': int_or_none(details.get('sr')), + 'asr': traverse_obj(details, ('sr', {int_or_none})), + **traverse_obj(song, { + 'ext': ('type', {str}), + 'abr': ('br', {self.kilo_or_none}), + 'filesize': ('size', {int_or_none}), + }), }) elif err == 0: - err = try_get(song, lambda x: x['code'], int) + err = traverse_obj(song, ('code', {int})) or 0 if not formats: - msg = 'No media links found' if err != 0 and (err < 200 or err >= 400): - raise ExtractorError( - '%s (site code %d)' % (msg, err, ), expected=True) + raise ExtractorError(f'No media links found (site code {err})', expected=True) else: self.raise_geo_restricted( - msg + ': probably this video is not available from your location due to geo restriction.', - countries=['CN']) - + 'No media links found: probably due to geo restriction.', countries=['CN']) return formats - @classmethod - def convert_milliseconds(cls, ms): - return int(round(ms / 1000.0)) - def query_api(self, endpoint, video_id, note): - req = Request('%s%s' % (self._API_BASE, endpoint)) - req.headers['Referer'] = self._API_BASE - return self._download_json(req, video_id, note) + result = self._download_json( + f'{self._API_BASE}{endpoint}', video_id, note, headers={'Referer': self._API_BASE}) + code = traverse_obj(result, ('code', {int})) + message = traverse_obj(result, ('message', {str})) or '' + if code == -462: + self.raise_login_required(f'Login required to download: {message}') + elif code != 200: + raise ExtractorError(f'Failed to get meta info: {code} {message}') + return result + + def _get_entries(self, songs_data, entry_keys=None, id_key='id', name_key='name'): + for song in traverse_obj(songs_data, ( + *variadic(entry_keys, (str, bytes, dict, set)), + lambda _, v: int_or_none(v[id_key]) is not None)): + song_id = str(song[id_key]) + yield self.url_result( + f'http://music.163.com/#/song?id={song_id}', NetEaseMusicIE, + song_id, traverse_obj(song, (name_key, {str}))) class NetEaseMusicIE(NetEaseMusicBaseIE): @@ -156,16 +130,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): IE_DESC = '网易云音乐' _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P[0-9]+)' _TESTS = [{ - 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'url': 'https://music.163.com/#/song?id=548648087', 'info_dict': { - 'id': '32102397', + 'id': '548648087', 'ext': 'mp3', - 'title': 'Bad Blood', - 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150516', - 'timestamp': 1431792000, - 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', + 'title': '戒烟 (Live)', + 'creator': '李荣浩 / 朱正廷 / 陈立农 / 尤长靖 / ONER灵超 / ONER木子洋 / 杨非同 / 陆定昊', + 'timestamp': 1522944000, + 'upload_date': '20180405', + 'description': 'md5:3650af9ee22c87e8637cb2dde22a765c', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + "duration": 256, + 'thumbnail': r're:^http.*\.jpg', }, }, { 'note': 'No lyrics.', @@ -176,21 +152,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', - 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, - }, - }, { - 'note': 'Has translated name.', - 'url': 'http://music.163.com/#/song?id=22735043', - 'info_dict': { - 'id': '22735043', - 'ext': 'mp3', - 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', - 'upload_date': '20100127', - 'timestamp': 1264608000, - 'alt_title': '说出愿望吧(Genie)', + 'duration': 263, + 'thumbnail': r're:^http.*\.jpg', }, }, { 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', @@ -203,59 +167,99 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'upload_date': '19911130', 'timestamp': 691516800, 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 268, + 'alt_title': '伴唱:现代人乐队 合唱:总政歌舞团', + 'thumbnail': r're:^http.*\.jpg', + }, + }, { + 'url': 'http://music.163.com/#/song?id=32102397', + 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'info_dict': { + 'id': '32102397', + 'ext': 'mp3', + 'title': 'Bad Blood', + 'creator': 'Taylor Swift / Kendrick Lamar', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:21535156efb73d6d1c355f95616e285a', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 199, + 'thumbnail': r're:^http.*\.jpg', }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Has translated name.', + 'url': 'http://music.163.com/#/song?id=22735043', + 'info_dict': { + 'id': '22735043', + 'ext': 'mp3', + 'title': '소원을 말해봐 (Genie)', + 'creator': '少女时代', + 'upload_date': '20100127', + 'timestamp': 1264608000, + 'description': 'md5:03d1ffebec3139aa4bafe302369269c5', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 229, + 'alt_title': '说出愿望吧(Genie)', + 'thumbnail': r're:^http.*\.jpg', + }, + 'skip': 'Blocked outside Mainland China', }] def _process_lyrics(self, lyrics_info): - original = lyrics_info.get('lrc', {}).get('lyric') - translated = lyrics_info.get('tlyric', {}).get('lyric') + original = traverse_obj(lyrics_info, ('lrc', 'lyric', {str})) + translated = traverse_obj(lyrics_info, ('tlyric', 'lyric', {str})) + + if not original or original == '[99:00.00]纯音乐,请欣赏\n': + return None if not translated: - return original + return { + 'lyrics': [{'data': original, 'ext': 'lrc'}], + } lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics + translation_ts_dict = dict(re.findall(lyrics_expr, translated)) + + merged = '\n'.join( + join_nonempty(f'{timestamp}{text}', translation_ts_dict.get(timestamp, ''), delim=' / ') + for timestamp, text in original_ts_texts) + + return { + 'lyrics_merged': [{'data': merged, 'ext': 'lrc'}], + 'lyrics': [{'data': original, 'ext': 'lrc'}], + 'lyrics_translated': [{'data': translated, 'ext': 'lrc'}], + } def _real_extract(self, url): song_id = self._match_id(url) - params = { - 'id': song_id, - 'ids': '[%s]' % song_id - } info = self.query_api( - 'song/detail?' + compat_urllib_parse_urlencode(params), - song_id, 'Downloading song info')['songs'][0] + f'song/detail?id={song_id}&ids=%5B{song_id}%5D', song_id, 'Downloading song info')['songs'][0] formats = self.extract_formats(info) - lyrics_info = self.query_api( - 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, - song_id, 'Downloading lyrics data') - lyrics = self._process_lyrics(lyrics_info) - - alt_title = None - if info.get('transNames'): - alt_title = '/'.join(info.get('transNames')) + lyrics = self._process_lyrics(self.query_api( + f'song/lyric?id={song_id}&lv=-1&tv=-1', song_id, 'Downloading lyrics data')) + lyric_data = { + 'description': traverse_obj(lyrics, (('lyrics_merged', 'lyrics'), 0, 'data'), get_all=False), + 'subtitles': lyrics, + } if lyrics else {} return { 'id': song_id, - 'title': info['name'], - 'alt_title': alt_title, - 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), - 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), - 'thumbnail': info.get('album', {}).get('picUrl'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'description': lyrics, 'formats': formats, + 'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None, + 'creator': ' / '.join(traverse_obj(info, ('artists', ..., 'name'))) or None, + **lyric_data, + **traverse_obj(info, { + 'title': ('name', {str}), + 'timestamp': ('album', 'publishTime', {self.kilo_or_none}), + 'thumbnail': ('album', 'picUrl', {url_or_none}), + 'duration': ('duration', {self.kilo_or_none}), + }), } @@ -263,31 +267,44 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): IE_NAME = 'netease:album' IE_DESC = '网易云音乐 - 专辑' _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P[0-9]+)' - _TEST = { + _TESTS = [{ + 'url': 'https://music.163.com/#/album?id=133153666', + 'info_dict': { + 'id': '133153666', + 'title': '桃几的翻唱', + 'upload_date': '20210913', + 'description': '桃几2021年翻唱合集', + 'thumbnail': r're:^http.*\.jpg', + }, + 'playlist_mincount': 13, + }, { 'url': 'http://music.163.com/#/album?id=220780', 'info_dict': { 'id': '220780', - 'title': 'B\'day', + 'title': 'B\'Day', + 'upload_date': '20060904', + 'description': 'md5:71a74e1d8f392d88cf1bbe48879ad0b0', + 'thumbnail': r're:^http.*\.jpg', }, 'playlist_count': 23, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): album_id = self._match_id(url) - - info = self.query_api( - 'album/%s?id=%s' % (album_id, album_id), - album_id, 'Downloading album data')['album'] - - name = info['name'] - desc = info.get('description') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['songs'] - ] - return self.playlist_result(entries, album_id, name, desc) + webpage = self._download_webpage(f'https://music.163.com/album?id={album_id}', album_id) + + songs = self._search_json( + r']+\bid="song-list-pre-data"[^>]*>', webpage, 'metainfo', album_id, + end_pattern=r'', contains_pattern=r'\[(?s:.+)\]') + metainfo = { + 'title': self._og_search_property('title', webpage, 'title', fatal=False), + 'description': self._html_search_regex( + (rf']+\bid="album-desc-{suffix}"[^>]*>(.*?)' for suffix in ('more', 'dot')), + webpage, 'description', flags=re.S, fatal=False), + 'thumbnail': self._og_search_property('image', webpage, 'thumbnail', fatal=False), + 'upload_date': unified_strdate(self._html_search_meta('music:release_date', webpage, 'date', fatal=False)), + } + return self.playlist_result(self._get_entries(songs), album_id, **metainfo) class NetEaseMusicSingerIE(NetEaseMusicBaseIE): @@ -299,10 +316,9 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'url': 'http://music.163.com/#/artist?id=10559', 'info_dict': { 'id': '10559', - 'title': '张惠妹 - aMEI;阿密特', + 'title': '张惠妹 - aMEI;阿妹;阿密特', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Singer has translated name.', 'url': 'http://music.163.com/#/artist?id=124098', @@ -311,28 +327,28 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'title': '李昇基 - 이승기', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Singer with both translated and alias', + 'url': 'https://music.163.com/#/artist?id=159692', + 'info_dict': { + 'id': '159692', + 'title': '初音ミク - 初音未来;Hatsune Miku', + }, + 'playlist_count': 50, }] def _real_extract(self, url): singer_id = self._match_id(url) info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), - singer_id, 'Downloading singer data') - - name = info['artist']['name'] - if info['artist']['trans']: - name = '%s - %s' % (name, info['artist']['trans']) - if info['artist']['alias']: - name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['hotSongs'] - ] - return self.playlist_result(entries, singer_id, name) + f'artist/{singer_id}?id={singer_id}', singer_id, note='Downloading singer data') + + name = join_nonempty( + traverse_obj(info, ('artist', 'name', {str})), + join_nonempty(*traverse_obj(info, ('artist', ('trans', ('alias', ...)), {str})), delim=';'), + delim=' - ') + + return self.playlist_result(self._get_entries(info, 'hotSongs'), singer_id, name) class NetEaseMusicListIE(NetEaseMusicBaseIE): @@ -344,10 +360,28 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '79177352', 'title': 'Billboard 2007 Top 100', - 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022', + 'tags': ['欧美'], + 'uploader': '浑然破灭', + 'uploader_id': '67549805', + 'timestamp': int, + 'upload_date': r're:\d{8}', }, - 'playlist_count': 99, - 'skip': 'Blocked outside Mainland China', + 'playlist_mincount': 95, + }, { + 'note': 'Toplist/Charts sample', + 'url': 'https://music.163.com/#/discover/toplist?id=60198', + 'info_dict': { + 'id': '60198', + 'title': 're:美国Billboard榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': '美国Billboard排行榜', + 'tags': ['流行', '欧美', '榜单'], + 'uploader': 'Billboard公告牌', + 'uploader_id': '48171', + 'timestamp': int, + 'upload_date': r're:\d{8}', + }, + 'playlist_count': 100, }, { 'note': 'Toplist/Charts sample', 'url': 'http://music.163.com/#/discover/toplist?id=3733003', @@ -363,64 +397,86 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, - list_id, 'Downloading playlist data')['result'] - - name = info['name'] - desc = info.get('description') + info = self._download_eapi_json( + '/v3/playlist/detail', list_id, + {'id': list_id, 't': '-1', 'n': '500', 's': '0'}, + note="Downloading playlist info") - if info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp( - self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') - name = '%s %s' % (name, datestamp) + metainfo = traverse_obj(info, ('playlist', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'tags': ('tags', ..., {str}), + 'uploader': ('creator', 'nickname', {str}), + 'uploader_id': ('creator', 'userId', {str_or_none}), + 'timestamp': ('updateTime', {self.kilo_or_none}), + })) + if traverse_obj(info, ('playlist', 'specialType')) == 10: + metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['tracks'] - ] - return self.playlist_result(entries, list_id, name, desc) + return self.playlist_result(self._get_entries(info, ('playlist', 'tracks')), list_id, **metainfo) class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' IE_DESC = '网易云音乐 - MV' _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P[0-9]+)' - _TEST = { + _TESTS = [{ + 'url': 'https://music.163.com/#/mv?id=10958064', + 'info_dict': { + 'id': '10958064', + 'ext': 'mp4', + 'title': '交换余生', + 'description': 'md5:e845872cff28820642a2b02eda428fea', + 'creator': '林俊杰', + 'upload_date': '20200916', + 'thumbnail': r're:http.*\.jpg', + 'duration': 364, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { 'url': 'http://music.163.com/#/mv?id=415350', 'info_dict': { 'id': '415350', 'ext': 'mp4', 'title': '이럴거면 그러지말지', 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白雅言', + 'creator': '白娥娟', 'upload_date': '20150520', + 'thumbnail': r're:http.*\.jpg', + 'duration': 216, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): mv_id = self._match_id(url) info = self.query_api( - 'mv/detail?id=%s&type=mp4' % mv_id, - mv_id, 'Downloading mv info')['data'] + f'mv/detail?id={mv_id}&type=mp4', mv_id, 'Downloading mv info')['data'] formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} + {'url': mv_url, 'ext': 'mp4', 'format_id': f'{brs}p', 'height': int_or_none(brs)} for brs, mv_url in info['brs'].items() ] return { 'id': mv_id, - 'title': info['name'], - 'description': info.get('desc') or info.get('briefDesc'), - 'creator': info['artistName'], - 'upload_date': info['publishTime'].replace('-', ''), 'formats': formats, - 'thumbnail': info.get('cover'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), + **traverse_obj(info, { + 'title': ('name', {str}), + 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), + 'creator': ('artistName', {str}), + 'upload_date': ('publishTime', {unified_strdate}), + 'thumbnail': ('cover', {url_or_none}), + 'duration': ('duration', {self.kilo_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'like_count': ('likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), } @@ -431,75 +487,74 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', 'info_dict': { - 'id': '10109055', + 'id': '32593346', 'ext': 'mp3', 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', 'creator': '大话西藏', - 'timestamp': 1434179342, + 'timestamp': 1434179287, 'upload_date': '20150613', + 'thumbnail': r're:http.*\.jpg', 'duration': 900, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { 'id': '10141022', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'creator': '滚滚电台ORZ', + 'timestamp': 1434450733, + 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', }, 'playlist_count': 4, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { - 'id': '10141022', + 'id': '32647209', 'ext': 'mp3', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450841, + 'creator': '滚滚电台ORZ', + 'timestamp': 1434450733, 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', + 'duration': 1104, }, 'params': { 'noplaylist': True }, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): program_id = self._match_id(url) info = self.query_api( - 'dj/program/detail?id=%s' % program_id, - program_id, 'Downloading program info')['program'] + f'dj/program/detail?id={program_id}', program_id, note='Downloading program info')['program'] - name = info['name'] - description = info['description'] + metainfo = traverse_obj(info, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'creator': ('dj', 'brand', {str}), + 'thumbnail': ('coverUrl', {url_or_none}), + 'timestamp': ('createTime', {self.kilo_or_none}), + }) if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): formats = self.extract_formats(info['mainSong']) return { - 'id': info['mainSong']['id'], - 'title': name, - 'description': description, - 'creator': info['dj']['brand'], - 'timestamp': self.convert_milliseconds(info['createTime']), - 'thumbnail': info['coverUrl'], - 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'id': str(info['mainSong']['id']), 'formats': formats, + 'duration': traverse_obj(info, ('mainSong', 'duration', {self.kilo_or_none})), + **metainfo, } - song_ids = [info['mainSong']['id']] - song_ids.extend([song['id'] for song in info['songs']]) - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song_id, - 'NetEaseMusic', song_id) - for song_id in song_ids - ] - return self.playlist_result(entries, program_id, name, description) + songs = traverse_obj(info, (('mainSong', ('songs', ...)),)) + return self.playlist_result(self._get_entries(songs), program_id, **metainfo) class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): @@ -511,38 +566,32 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '42', 'title': '声音蔓延', - 'description': 'md5:766220985cbd16fdd552f64c578a6b15' + 'description': 'md5:c7381ebd7989f9f367668a5aee7d5f08' }, 'playlist_mincount': 40, - 'skip': 'Blocked outside Mainland China', } _PAGE_SIZE = 1000 def _real_extract(self, url): dj_id = self._match_id(url) - name = None - desc = None + metainfo = {} entries = [] for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( - 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' - % (self._PAGE_SIZE, dj_id, offset), - dj_id, 'Downloading dj programs - %d' % offset) - - entries.extend([ - self.url_result( - 'http://music.163.com/#/program?id=%s' % program['id'], - 'NetEaseMusicProgram', program['id']) - for program in info['programs'] - ]) - - if name is None: - radio = info['programs'][0]['radio'] - name = radio['name'] - desc = radio['desc'] + f'dj/program/byradio?asc=false&limit={self._PAGE_SIZE}&radioId={dj_id}&offset={offset}', + dj_id, note=f'Downloading dj programs - {offset}') + + entries.extend(self.url_result( + f'http://music.163.com/#/program?id={program["id"]}', NetEaseMusicProgramIE, + program['id'], program.get('name')) for program in info['programs']) + if not metainfo: + metainfo = traverse_obj(info, ('programs', 0, 'radio', { + 'title': ('name', {str}), + 'description': ('desc', {str}), + })) if not info['more']: break - return self.playlist_result(entries, dj_id, name, desc) + return self.playlist_result(entries, dj_id, **metainfo) From a9efb4b8d74f3583450ffda0ee57259a47d39c70 Mon Sep 17 00:00:00 2001 From: xofe <22776566+xofe@users.noreply.github.com> Date: Fri, 6 Oct 2023 22:35:11 +0000 Subject: [PATCH 003/108] [ie/abc.net.au:iview] Improve `episode` extraction (#8201) Authored by: xofe --- yt_dlp/extractor/abc.py | 90 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index d2cf5f7c518c..9d527246a126 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -181,18 +181,102 @@ class ABCIViewIE(InfoExtractor): _GEO_COUNTRIES = ['AU'] _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/utopia/series/1/video/CO1211V001S00', + 'md5': '52a942bfd7a0b79a6bfe9b4ce6c9d0ed', + 'info_dict': { + 'id': 'CO1211V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 Wood For The Trees', + 'series': 'Utopia', + 'description': 'md5:0cfb2c183c1b952d1548fd65c8a95c00', + 'upload_date': '20230726', + 'uploader_id': 'abc1', + 'series_id': 'CO1211V', + 'episode_id': 'CO1211V001S00', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'Wood For The Trees', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/co/CO1211V001S00_5ad8353f4df09_1280.jpg', + 'timestamp': 1690403700, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name', 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', 'md5': '67715ce3c78426b11ba167d875ac6abf', 'info_dict': { 'id': 'LE1927H001S00', 'ext': 'mp4', - 'title': "Series 11 Ep 1", - 'series': "Gruen", + 'title': 'Series 11 Ep 1', + 'series': 'Gruen', 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', 'upload_date': '20190925', 'uploader_id': 'abc1', + 'series_id': 'LE1927H', + 'episode_id': 'LE1927H001S00', + 'season_number': 11, + 'season': 'Season 11', + 'episode_number': 1, + 'episode': 'Episode 1', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/le/LE1927H001S00_5d954fbd79e25_1280.jpg', 'timestamp': 1569445289, }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode number', + 'url': 'https://iview.abc.net.au/show/four-corners/series/2022/video/NC2203H039S00', + 'md5': '77cb7d8434440e3b28fbebe331c2456a', + 'info_dict': { + 'id': 'NC2203H039S00', + 'ext': 'mp4', + 'title': 'Series 2022 Locking Up Kids', + 'series': 'Four Corners', + 'description': 'md5:54829ca108846d1a70e1fcce2853e720', + 'upload_date': '20221114', + 'uploader_id': 'abc1', + 'series_id': 'NC2203H', + 'episode_id': 'NC2203H039S00', + 'season_number': 2022, + 'season': 'Season 2022', + 'episode_number': None, + 'episode': 'Locking Up Kids', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/nc/NC2203H039S00_636d8a0944a22_1920.jpg', + 'timestamp': 1668460497, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name or number', + 'url': 'https://iview.abc.net.au/show/landline/series/2021/video/RF2004Q043S00', + 'md5': '2e17dec06b13cc81dc119d2565289396', + 'info_dict': { + 'id': 'RF2004Q043S00', + 'ext': 'mp4', + 'title': 'Series 2021', + 'series': 'Landline', + 'description': 'md5:c9f30d9c0c914a7fd23842f6240be014', + 'upload_date': '20211205', + 'uploader_id': 'abc1', + 'series_id': 'RF2004Q', + 'episode_id': 'RF2004Q043S00', + 'season_number': 2021, + 'season': 'Season 2021', + 'episode_number': None, + 'episode': None, + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/rf/RF2004Q043S00_61a950639dbc0_1920.jpg', + 'timestamp': 1638710705, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], 'params': { 'skip_download': True, }, @@ -254,6 +338,8 @@ def tokenize_url(url, token): 'episode_number': int_or_none(self._search_regex( r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), 'episode_id': house_number, + 'episode': self._search_regex( + r'^(?:Series\s+\d+)?\s*(?:Ep\s+\d+)?\s*(.*)$', title, 'episode', default='') or None, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, From 48cceec1ddb8649b5e771df8df79eb9c39c82b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:38:26 -0300 Subject: [PATCH 004/108] [ie/lbry] Add playlist support (#8213) Closes #5982, Closes #8204 Authored by: drzraf, bashonly, Grub4K --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/lbry.py | 184 ++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 908abb8aced3..ef6123e8a7b1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -951,6 +951,7 @@ from .lbry import ( LBRYIE, LBRYChannelIE, + LBRYPlaylistIE, ) from .lci import LCIIE from .lcp import ( diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 9a9f9256fed0..ccce300b5b51 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -22,10 +22,11 @@ class LBRYBaseIE(InfoExtractor): - _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' + _BASE_URL_REGEX = r'(?x)(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX + _OPT_CLAIM_ID = '[^$@:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + _PAGE_SIZE = 50 def _call_api_proxy(self, method, display_id, params, resource): headers = {'Content-Type': 'application/json-rpc'} @@ -77,10 +78,70 @@ def _parse_stream(self, stream, url): return info + def _fetch_page(self, display_id, url, params, page): + page += 1 + page_params = { + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + **params, + } + result = self._call_api_proxy( + 'claim_search', display_id, page_params, f'page {page}') + for item in traverse_obj(result, ('items', lambda _, v: v['name'] and v['claim_id'])): + yield { + **self._parse_stream(item, url), + '_type': 'url', + 'id': item['claim_id'], + 'url': self._permanent_url(url, item['name'], item['claim_id']), + } + + def _playlist_entries(self, url, display_id, claim_param, metadata): + qs = parse_qs(url) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'claim_type': 'stream', + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + **claim_param, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, display_id, url, params), + self._PAGE_SIZE) + + return self.playlist_result( + entries, display_id, **traverse_obj(metadata, ('value', { + 'title': 'title', + 'description': 'description', + }))) + class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' + (?:\$/(?:download|embed)/)? + (?P + [^$@:/?#]+/{LBRYBaseIE._CLAIM_ID_REGEX} + |(?:@{LBRYBaseIE._OPT_CLAIM_ID}/)?{LBRYBaseIE._OPT_CLAIM_ID} + )''' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -149,7 +210,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', + 'formats': 'mincount:3', # FIXME 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -184,12 +245,12 @@ class LBRYIE(LBRYBaseIE): 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', 'ext': 'mp4', 'title': 'Biotechnological Invasion of Skin (April 2023)', - 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'description': 'md5:fe28689db2cb7ba3436d819ac3ffc378', 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', - 'timestamp': 1685790036, - 'upload_date': '20230603', + 'timestamp': 1695114347, + 'upload_date': '20230919', 'release_timestamp': 1685617473, 'release_date': '20230601', 'duration': 1063, @@ -229,10 +290,10 @@ class LBRYIE(LBRYBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - if display_id.startswith('$/'): - display_id = display_id.split('/', 2)[-1].replace('/', ':') - else: + if display_id.startswith('@'): display_id = display_id.replace(':', '#') + else: + display_id = display_id.replace('/', ':') display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') @@ -299,7 +360,7 @@ def _real_extract(self, url): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', 'info_dict': { @@ -315,65 +376,50 @@ class LBRYChannelIE(LBRYBaseIE): 'url': 'lbry://@lbry#3f', 'only_matching': True, }] - _PAGE_SIZE = 50 - - def _fetch_page(self, claim_id, url, params, page): - page += 1 - page_params = { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - } - page_params.update(params) - result = self._call_api_proxy( - 'claim_search', claim_id, page_params, 'page %d' % page) - for item in (result.get('items') or []): - stream_claim_name = item.get('name') - stream_claim_id = item.get('claim_id') - if not (stream_claim_name and stream_claim_id): - continue - - yield { - **self._parse_stream(item, url), - '_type': 'url', - 'id': stream_claim_id, - 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') - result = self._resolve_url( - 'lbry://' + display_id, display_id, 'channel') + result = self._resolve_url(f'lbry://{display_id}', display_id, 'channel') claim_id = result['claim_id'] - qs = parse_qs(url) - content = qs.get('content', [None])[0] - params = { - 'fee_amount': qs.get('fee_amount', ['>=0'])[0], - 'order_by': { - 'new': ['release_time'], - 'top': ['effective_amount'], - 'trending': ['trending_group', 'trending_mixed'], - }[qs.get('order', ['new'])[0]], - 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, - } - duration = qs.get('duration', [None])[0] - if duration: - params['duration'] = { - 'long': '>=1200', - 'short': '<=240', - }[duration] - language = qs.get('language', ['all'])[0] - if language != 'all': - languages = [language] - if language == 'en': - languages.append('none') - params['any_languages'] = languages - entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url, params), - self._PAGE_SIZE) - result_value = result.get('value') or {} - return self.playlist_result( - entries, claim_id, result_value.get('title'), - result_value.get('description')) + + return self._playlist_entries(url, claim_id, {'channel_ids': [claim_id]}, result) + + +class LBRYPlaylistIE(LBRYBaseIE): + IE_NAME = 'lbry:playlist' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P[0-9a-f-]+)' + _TESTS = [{ + 'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'info_dict': { + 'id': 'ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'title': 'Théâtre Classique', + 'description': 'Théâtre Classique', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://odysee.com/$/list/9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'info_dict': { + 'id': '9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'title': 'Social Media Exposed', + 'description': 'md5:98af97317aacd5b85d595775ea37d80e', + }, + 'playlist_mincount': 34, + }, { + 'url': 'https://odysee.com/$/playlist/938fb11d-215f-4d1c-ad64-723954df2184', + 'info_dict': { + 'id': '938fb11d-215f-4d1c-ad64-723954df2184', + }, + 'playlist_mincount': 1000, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + result = traverse_obj(self._call_api_proxy('claim_search', display_id, { + 'claim_ids': [display_id], + 'no_totals': True, + 'page': 1, + 'page_size': self._PAGE_SIZE, + }, 'playlist'), ('items', 0)) + claim_param = {'claim_ids': traverse_obj(result, ('value', 'claims', ..., {str}))} + + return self._playlist_entries(url, display_id, claim_param, result) From fbcc299bd8a19cf8b3c8805d6c268a9110230973 Mon Sep 17 00:00:00 2001 From: Umar Getagazov Date: Sat, 7 Oct 2023 01:45:46 +0300 Subject: [PATCH 005/108] [ie/substack] Fix embed extraction (#8218) Authored by: handlerug --- yt_dlp/extractor/substack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 3782ceed1c7b..5835a5a8d35f 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -50,7 +50,7 @@ def _extract_embed_urls(cls, url, webpage): if not re.search(r']+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): return - mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P[^"]+)', webpage) + mobj = re.search(r'{[^}]*\\?["\']subdomain\\?["\']\s*:\s*\\?["\'](?P[^\\"\']+)', webpage) if mobj: parsed = urllib.parse.urlparse(url) yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() From 2f2dda3a7e85148773da3cdbc03ac9949ec1bc45 Mon Sep 17 00:00:00 2001 From: Umar Getagazov Date: Sat, 7 Oct 2023 01:48:54 +0300 Subject: [PATCH 006/108] [ie/substack] Fix download cookies bug (#8219) Authored by: handlerug --- yt_dlp/extractor/substack.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 5835a5a8d35f..6ee3f75e1a6c 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -56,10 +56,10 @@ def _extract_embed_urls(cls, url, webpage): yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() raise cls.StopExtraction() - def _extract_video_formats(self, video_id, username): + def _extract_video_formats(self, video_id, url): formats, subtitles = [], {} for video_format in ('hls', 'mp4'): - video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + video_url = urllib.parse.urljoin(url, f'/api/v1/video/upload/{video_id}/src?type={video_format}') if video_format == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) @@ -81,12 +81,17 @@ def _real_extract(self, url): r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) + canonical_url = url + domain = traverse_obj(webpage_info, ('domainInfo', 'customDomain', {str})) + if domain: + canonical_url = urllib.parse.urlparse(url)._replace(netloc=domain).geturl() + post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} elif post_type == 'video': - formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: self.raise_no_formats(f'Page type "{post_type}" is not supported') @@ -99,4 +104,5 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), 'uploader': traverse_obj(webpage_info, ('pub', 'name')), 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + 'webpage_url': canonical_url, } From 2ad3873f0dfa9285c91d2160e36c039e69d597c7 Mon Sep 17 00:00:00 2001 From: garret Date: Fri, 6 Oct 2023 23:53:11 +0100 Subject: [PATCH 007/108] [ie/radiko] Improve extraction (#8221) Authored by: garret1317 --- yt_dlp/extractor/radiko.py | 67 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index cef68eba08eb..8c8fb1a8f90b 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -1,4 +1,5 @@ import base64 +import random import urllib.parse from .common import InfoExtractor @@ -13,6 +14,7 @@ class RadikoBaseIE(InfoExtractor): + _GEO_BYPASS = False _FULL_KEY = None _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ( 'https://c-rpaa.smartstream.ne.jp', @@ -32,7 +34,7 @@ class RadikoBaseIE(InfoExtractor): 'https://c-radiko.smartstream.ne.jp', ) - def _auth_client(self): + def _negotiate_token(self): _, auth1_handle = self._download_webpage_handle( 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', headers={ @@ -58,10 +60,23 @@ def _auth_client(self): 'x-radiko-partialkey': partial_key, }).split(',')[0] + if area_id == 'OUT': + self.raise_geo_restricted(countries=['JP']) + auth_data = (auth_token, area_id) self.cache.store('radiko', 'auth_data', auth_data) return auth_data + def _auth_client(self): + cachedata = self.cache.load('radiko', 'auth_data') + if cachedata is not None: + response = self._download_webpage( + 'https://radiko.jp/v2/api/auth_check', None, 'Checking cached token', expected_status=401, + headers={'X-Radiko-AuthToken': cachedata[0], 'X-Radiko-AreaId': cachedata[1]}) + if response == 'OK': + return cachedata + return self._negotiate_token() + def _extract_full_key(self): if self._FULL_KEY: return self._FULL_KEY @@ -75,7 +90,7 @@ def _extract_full_key(self): if full_key: full_key = full_key.encode() - else: # use full key ever known + else: # use only full key ever known full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa' self._FULL_KEY = full_key @@ -103,24 +118,24 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, m3u8_playlist_data = self._download_xml( f'https://radiko.jp/v3/station/stream/pc_html5/{station}.xml', video_id, note='Downloading stream information') - m3u8_urls = m3u8_playlist_data.findall('.//url') formats = [] found = set() - for url_tag in m3u8_urls: - pcu = url_tag.find('playlist_create_url').text - url_attrib = url_tag.attrib + + timefree_int = 0 if is_onair else 1 + + for element in m3u8_playlist_data.findall(f'.//url[@timefree="{timefree_int}"]/playlist_create_url'): + pcu = element.text + if pcu in found: + continue + found.add(pcu) playlist_url = update_url_query(pcu, { 'station_id': station, **query, 'l': '15', - 'lsid': '88ecea37e968c1f17d5413312d9f8003', + 'lsid': ''.join(random.choices('0123456789abcdef', k=32)), 'type': 'b', }) - if playlist_url in found: - continue - else: - found.add(playlist_url) time_to_skip = None if is_onair else cursor - ft @@ -138,7 +153,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)): sf['preference'] = -100 sf['format_note'] = 'not preferred' - if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: + if not is_onair and timefree_int == 1 and time_to_skip: sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} formats.extend(subformats) @@ -166,21 +181,7 @@ def _real_extract(self, url): vid_int = unified_timestamp(video_id, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) - auth_cache = self.cache.load('radiko', 'auth_data') - for attempt in range(2): - auth_token, area_id = (not attempt and auth_cache) or self._auth_client() - formats = self._extract_formats( - video_id=video_id, station=station, is_onair=False, - ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, - query={ - 'start_at': radio_begin, - 'ft': radio_begin, - 'end_at': radio_end, - 'to': radio_end, - 'seek': video_id, - }) - if formats: - break + auth_token, area_id = self._auth_client() return { 'id': video_id, @@ -189,8 +190,18 @@ def _real_extract(self, url): 'uploader': try_call(lambda: station_program.find('.//name').text), 'uploader_id': station, 'timestamp': vid_int, - 'formats': formats, 'is_live': True, + 'formats': self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id + } + ), } From 35d9cbaf9638ccc9daf8a863063b2e7c135bc664 Mon Sep 17 00:00:00 2001 From: AS6939 <46506352+AS6939@users.noreply.github.com> Date: Sat, 7 Oct 2023 06:56:12 +0800 Subject: [PATCH 008/108] [ie/iq.com] Fix extraction and subtitles (#8260) Closes #7734, Closes #8123 Authored by: AS6939 --- yt_dlp/extractor/iqiyi.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index fa602ba8871a..3368ab1d93af 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -499,9 +499,10 @@ class IqIE(InfoExtractor): 'tm': tm, 'qdy': 'a', 'qds': 0, - 'k_ft1': 141287244169348, - 'k_ft4': 34359746564, - 'k_ft5': 1, + 'k_ft1': '143486267424900', + 'k_ft4': '1572868', + 'k_ft7': '4', + 'k_ft5': '1', 'bop': JSON.stringify({ 'version': '10.0', 'dfp': dfp @@ -529,14 +530,22 @@ def _extract_vms_player_js(self, webpage, video_id): webpack_js_url = self._proto_relative_url(self._search_regex( r'') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + data.get('file_url') or data['stream_url'], video_id, 'm4a', m3u8_id='hls'), + 'age_limit': 18, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'release_timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {str_or_none}), + 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('plays', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'webpage_url': ('permalink_url', {url_or_none}), + }), + } From 0e722f2f3ca42e634fd7b06ee70b16bf833ce132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:59:42 -0300 Subject: [PATCH 010/108] [ie/lbry] Extract `uploader_id` (#8244) Closes #123 Authored by: drzraf --- yt_dlp/extractor/lbry.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index ccce300b5b51..cc37c41e8cbe 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -70,11 +70,11 @@ def _parse_stream(self, stream, url): 'duration': ('value', stream_type, 'duration', {int_or_none}), 'channel': ('signing_channel', 'value', 'title', {str}), 'channel_id': ('signing_channel', 'claim_id', {str}), + 'uploader_id': ('signing_channel', 'name', {str}), }) - channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) - if channel_name and info.get('channel_id'): - info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) + if info.get('uploader_id') and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, info['uploader_id'], info['channel_id']) return info @@ -159,6 +159,7 @@ class LBRYIE(LBRYBaseIE): 'height': 720, 'thumbnail': 'https://spee.ch/7/67f2d809c263288c.png', 'license': 'None', + 'uploader_id': '@Mantega', 'duration': 346, 'channel': 'LBRY/Odysee rats united!!!', 'channel_id': '1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', @@ -192,6 +193,7 @@ class LBRYIE(LBRYBaseIE): 'vcodec': 'none', 'thumbnail': 'https://spee.ch/d/0bc63b0e6bf1492d.png', 'license': 'None', + 'uploader_id': '@LBRYFoundation', } }, { 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', @@ -210,7 +212,8 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', # FIXME + 'uploader_id': '@gardeningincanada', + 'formats': 'mincount:3', 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -235,6 +238,7 @@ class LBRYIE(LBRYBaseIE): 'formats': 'mincount:1', 'thumbnail': 'startswith:https://thumb', 'license': 'None', + 'uploader_id': '@RT', }, 'params': {'skip_download': True} }, { @@ -249,6 +253,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'uploader_id': '@wickedtruths', 'timestamp': 1695114347, 'upload_date': '20230919', 'release_timestamp': 1685617473, From e831c80e8b2fc025b3b67d82974cc59e3526fdc8 Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 7 Oct 2023 00:05:48 +0100 Subject: [PATCH 011/108] [ie/nhk] Fix VOD extraction (#8249) Closes #8242 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index fbd6a18f6d82..bcbc2279f6f3 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -28,6 +28,44 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip): m_id, lang, '/all' if is_video else ''), m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + def _get_api_info(self, refresh=True): + if not refresh: + return self.cache.load('nhk', 'api_info') + + self.cache.store('nhk', 'api_info', {}) + movie_player_js = self._download_webpage( + 'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None, + note='Downloading stream API information') + api_info = { + 'url': self._search_regex( + r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'), + 'token': self._search_regex( + r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'), + } + self.cache.store('nhk', 'api_info', api_info) + return api_info + + def _extract_formats_and_subtitles(self, vod_id): + for refresh in (False, True): + api_info = self._get_api_info(refresh) + if not api_info: + continue + + api_url = api_info.pop('url') + stream_url = traverse_obj( + self._download_json( + api_url, vod_id, 'Downloading stream url info', fatal=False, query={ + **api_info, + 'type': 'json', + 'optional_id': vod_id, + 'active_flg': 1, + }), + ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + if stream_url: + return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + + raise ExtractorError('Unable to extract stream url') + def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() @@ -67,12 +105,14 @@ def get_clean_field(key): } if is_video: vod_id = episode['vod_id'] + formats, subs = self._extract_formats_and_subtitles(vod_id) + info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id, 'id': vod_id, + 'formats': formats, + 'subtitles': subs, }) + else: if fetch_episode: audio_path = episode['audio']['audio'] From 19c90e405b4137c06dfe6f9aaa02396df0da93e5 Mon Sep 17 00:00:00 2001 From: trainman261 Date: Sat, 7 Oct 2023 01:56:19 +0200 Subject: [PATCH 012/108] [cleanup] Update extractor tests (#7718) Authored by: trainman261 --- yt_dlp/extractor/aenetworks.py | 1 + yt_dlp/extractor/amcnetworks.py | 1 + yt_dlp/extractor/cbc.py | 7 ++++++- yt_dlp/extractor/cbs.py | 2 ++ yt_dlp/extractor/cnbc.py | 2 ++ yt_dlp/extractor/corus.py | 3 ++- yt_dlp/extractor/generic.py | 13 ++++++++++--- yt_dlp/extractor/mediaset.py | 3 ++- yt_dlp/extractor/movieclips.py | 1 + yt_dlp/extractor/nationalgeographic.py | 3 +++ yt_dlp/extractor/nbc.py | 22 +++++++++++++++++----- yt_dlp/extractor/scrippsnetworks.py | 4 ++++ yt_dlp/extractor/syfy.py | 1 + yt_dlp/extractor/theplatform.py | 6 +++--- yt_dlp/extractor/theweatherchannel.py | 20 +++++++++++--------- 15 files changed, 66 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index f049a0fb3cc5..cc26653c1dee 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -338,6 +338,7 @@ class BiographyIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': '404 Not Found', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py index c58bc7bfbf8f..10bd021c55ac 100644 --- a/yt_dlp/extractor/amcnetworks.py +++ b/yt_dlp/extractor/amcnetworks.py @@ -26,6 +26,7 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE # m3u8 download 'skip_download': True, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 2920b9027d8e..be2d13e442f5 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -66,6 +66,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', 'timestamp': 255977160, }, + 'skip': '404 Not Found', }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', @@ -97,7 +98,7 @@ class CBCIE(InfoExtractor): # multiple CBC.APP.Caffeine.initInstance(...) 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', 'info_dict': { - 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', # FIXME 'id': 'dog-indoor-exercise-winter-1.3928238', 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, @@ -476,6 +477,10 @@ class CBCGemPlaylistIE(InfoExtractor): 'id': 'schitts-creek/s06', 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + 'series': 'Schitt\'s Creek', + 'season_number': 6, + 'season': 'Season 6', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, }, { 'url': 'https://gem.cbc.ca/schitts-creek/s06', diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 1c0dbdea9444..d97fbd758c38 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -101,6 +101,7 @@ class CBSIE(CBSBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'Subscription required', }, { 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { @@ -117,6 +118,7 @@ class CBSIE(CBSBaseIE): }, 'expected_warnings': [ 'This content expired on', 'No video formats found', 'Requested format is not available'], + 'skip': '404 Not Found', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 'only_matching': True, diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py index 68fd025b7c69..7d209b6d9012 100644 --- a/yt_dlp/extractor/cnbc.py +++ b/yt_dlp/extractor/cnbc.py @@ -19,6 +19,7 @@ class CNBCIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): @@ -49,6 +50,7 @@ class CNBCVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index c03d65310d2b..bcc34ddd8a12 100644 --- a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -41,7 +41,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE ) ''' _TESTS = [{ - 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', + 'url': 'https://www.hgtv.ca/video/bryan-inc/movie-night-popcorn-with-bryan/870923331648/', 'info_dict': { 'id': '870923331648', 'ext': 'mp4', @@ -54,6 +54,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], + # FIXME: yt-dlp wrongly raises for geo restriction }, { 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', 'only_matching': True, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 33e71d1c5729..5e1240c13abf 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -58,6 +58,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', + 'direct': True, + 'timestamp': 1273772943.0, } }, # Direct link to media delivered compressed (until Accept-Encoding is *) @@ -101,6 +103,8 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' @@ -133,6 +137,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -145,12 +150,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, @@ -267,6 +272,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -278,6 +284,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -318,7 +325,7 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index e3b728dcae20..2d6204298272 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -127,7 +127,8 @@ class MediasetIE(ThePlatformBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Dead link', }, { # WittyTV embed 'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/', diff --git a/yt_dlp/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py index 4777f440e012..f7f2921fdbac 100644 --- a/yt_dlp/extractor/movieclips.py +++ b/yt_dlp/extractor/movieclips.py @@ -23,6 +23,7 @@ class MovieClipsIE(InfoExtractor): 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], + 'skip': 'redirects to YouTube', } def _real_extract(self, url): diff --git a/yt_dlp/extractor/nationalgeographic.py b/yt_dlp/extractor/nationalgeographic.py index ad525c258998..6f046bc29c10 100644 --- a/yt_dlp/extractor/nationalgeographic.py +++ b/yt_dlp/extractor/nationalgeographic.py @@ -24,6 +24,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', @@ -38,6 +39,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, ] @@ -75,6 +77,7 @@ class NationalGeographicTVIE(FOXIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'Content not available', }] _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index b3c28ab55d5b..666550a4913f 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -284,7 +284,7 @@ class NBCSportsIE(InfoExtractor): _TESTS = [{ # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation', 'info_dict': { 'id': 'PHJSaFWbrTY9', 'ext': 'mp4', @@ -379,7 +379,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate 'info_dict': { 'id': '269389891880', 'ext': 'mp4', @@ -387,6 +387,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 'timestamp': 1401363060, 'upload_date': '20140529', + 'duration': 46.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg', }, }, { @@ -402,7 +404,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'md5': '40d0e48c68896359c80372306ece0fc3', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', @@ -410,11 +412,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, 'upload_date': '20150205', + 'duration': 1236.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939', 'info_dict': { 'id': 'n431456', 'ext': 'mp4', @@ -422,11 +426,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, + 'duration': 37.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg', }, }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'md5': '693d1fa21d23afcc9b04c66b227ed9ff', 'info_dict': { 'id': '669831235788', 'ext': 'mp4', @@ -434,6 +440,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'duration': 69.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg', }, }, { @@ -447,6 +455,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'duration': 940.0, }, }, { @@ -535,6 +544,7 @@ class NBCOlympicsIE(InfoExtractor): 'upload_date': '20160815', 'uploader': 'NBCU-SPORTS', }, + 'skip': '404 Not Found', } def _real_extract(self, url): @@ -578,6 +588,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, { 'note': 'Plain m3u8 source URL', 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', @@ -589,6 +600,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, ] diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py index adfd7e5f297e..7f0bc9645610 100644 --- a/yt_dlp/extractor/scrippsnetworks.py +++ b/yt_dlp/extractor/scrippsnetworks.py @@ -39,6 +39,7 @@ class ScrippsNetworksWatchIE(AWSIE): 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], + 'skip': '404 Not Found', }] _SNI_TABLE = { @@ -113,6 +114,9 @@ class ScrippsNetworksIE(InfoExtractor): 'timestamp': 1475678834, 'upload_date': '20161005', 'uploader': 'SCNI-SCND', + 'duration': 29.995, + 'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': ''}], + 'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg', }, 'add_ie': ['ThePlatform'], 'expected_warnings': ['No HLS formats found'], diff --git a/yt_dlp/extractor/syfy.py b/yt_dlp/extractor/syfy.py index c79d27a0de5c..afcdbf78043a 100644 --- a/yt_dlp/extractor/syfy.py +++ b/yt_dlp/extractor/syfy.py @@ -23,6 +23,7 @@ class SyfyIE(AdobePassIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 99caeb5f99ab..433ce8427c5d 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -167,7 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): # rtmp download 'skip_download': True, }, - 'skip': '404 Not Found', + 'skip': 'CNet no longer uses ThePlatform', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -177,7 +177,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', }, - 'skip': '404 Not Found', + 'skip': 'Dead link', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -195,7 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, - 'skip': '404 Not Found', + 'skip': 'Error: Player PID "nbcNewsOffsite" is disabled', }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 diff --git a/yt_dlp/extractor/theweatherchannel.py b/yt_dlp/extractor/theweatherchannel.py index 682e4335d2c5..d1921e4f9a83 100644 --- a/yt_dlp/extractor/theweatherchannel.py +++ b/yt_dlp/extractor/theweatherchannel.py @@ -11,17 +11,19 @@ class TheWeatherChannelIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?weather\.com(?P(?:/(?P[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P[^/?#]+))' _TESTS = [{ - 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', + 'url': 'https://weather.com/storms/hurricane/video/invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'md5': '68f0cf616435683f27ce36bd9c927394', 'info_dict': { - 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'id': '81acef2d-ee8c-4545-ba83-bff3cc80db97', 'ext': 'mp4', - 'title': 'Ice Climber Is In For A Shock', - 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', - 'uploader': 'TWC - Digital (No Distro)', - 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', - 'upload_date': '20160720', - 'timestamp': 1469018835, + 'title': 'Invest 95L In Atlantic Has A Medium Chance Of Development', + 'description': 'md5:0de720fd5f0d0e32207bd4c270fff824', + 'uploader': 'TWC - Digital', + 'uploader_id': 'b5a999e0-9e04-11e1-9ee2-001d092f5a10', + 'upload_date': '20230721', + 'timestamp': 1689967343, + 'display_id': 'invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'duration': 34.0, } }, { 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', From 792f1e64f6a2beac51e85408d142b3118115c4fd Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Sat, 7 Oct 2023 05:56:47 +0600 Subject: [PATCH 013/108] [ie/theta] Remove extractors (#8251) Authored by: alerikaisattera --- yt_dlp/extractor/_extractors.py | 4 -- yt_dlp/extractor/theta.py | 90 --------------------------------- 2 files changed, 94 deletions(-) delete mode 100644 yt_dlp/extractor/theta.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b10ef2f33207..55c3c2f8e88b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2004,10 +2004,6 @@ ) from .thestar import TheStarIE from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE diff --git a/yt_dlp/extractor/theta.py b/yt_dlp/extractor/theta.py deleted file mode 100644 index ecf0ea091dd3..000000000000 --- a/yt_dlp/extractor/theta.py +++ /dev/null @@ -1,90 +0,0 @@ -from .common import InfoExtractor -from ..utils import try_get - - -class ThetaStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P[a-z0-9-]+)' - _TESTS = [{ - 'url': 'https://www.theta.tv/davirus', - 'skip': 'The live may have ended', - 'info_dict': { - 'id': 'DaVirus', - 'ext': 'mp4', - 'title': 'I choose you - My Community is King -👀 - YO HABLO ESPANOL - CODE DAVIRUS', - 'thumbnail': r're:https://live-thumbnails-prod-theta-tv\.imgix\.net/thumbnail/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/mst3k', - 'note': 'This channel is live 24/7', - 'info_dict': { - 'id': 'MST3K', - 'ext': 'mp4', - 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/contv-anime', - 'info_dict': { - 'id': 'ConTVAnime', - 'ext': 'mp4', - 'title': 'CONTV ANIME 24/7. Powered by THETA Network.', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/channel?alias={channel_id}', channel_id)['body'] - - m3u8_playlist = next( - data['url'] for data in info['live_stream']['video_urls'] - if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source')) - - formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) - - channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization - - return { - 'id': channel, - 'title': try_get(info, lambda x: x['live_stream']['title']), - 'channel': channel, - 'view_count': try_get(info, lambda x: x['live_stream']['view_count']), - 'is_live': True, - 'formats': formats, - 'thumbnail': try_get(info, lambda x: x['live_stream']['thumbnail_url']), - } - - -class ThetaVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/video/(?Pvid[a-z0-9]+)' - _TEST = { - 'url': 'https://www.theta.tv/video/vidiq6aaet3kzf799p0', - 'md5': '633d8c29eb276bb38a111dbd591c677f', - 'info_dict': { - 'id': 'vidiq6aaet3kzf799p0', - 'ext': 'mp4', - 'title': 'Theta EdgeCast Tutorial', - 'uploader': 'Pixiekittie', - 'description': 'md5:e316253f5bdced8b5a46bb50ae60a09f', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+/vod_thumb/.+.jpg', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/video/{video_id}/raw', video_id)['body'] - - m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url']) - - formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': info.get('title'), - 'uploader': try_get(info, lambda x: x['user']['username']), - 'description': info.get('description'), - 'view_count': info.get('view_count'), - 'like_count': info.get('like_count'), - 'formats': formats, - 'thumbnail': info.get('thumbnail_url'), - } From 03e85ea99db76a2fddb65bf46f8819bda780aaf3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Oct 2023 20:00:15 -0500 Subject: [PATCH 014/108] [ie/youtube] Fix `heatmap` extraction (#8299) Closes #8189 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7e13aa77979d..b7ac3e9cc1ce 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3292,16 +3292,15 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) - def _extract_heatmap_from_player_overlay(self, data): - content_list = traverse_obj(data, ( - 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', - 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) - return next(filter(None, ( - traverse_obj(contents, (..., 'heatMarkerRenderer', { - 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), - 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, - 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), - })) for contents in content_list)), None) + def _extract_heatmap(self, data): + return traverse_obj(data, ( + 'frameworkUpdates', 'entityBatchUpdate', 'mutations', + lambda _, v: v['payload']['macroMarkersListEntity']['markersList']['markerType'] == 'MARKER_TYPE_HEATMAP', + 'payload', 'macroMarkersListEntity', 'markersList', 'markers', ..., { + 'start_time': ('startMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (int(x['startMillis']) + int(x['durationMillis'])) / 1000}, + 'value': ('intensityScoreNormalized', {float_or_none}), + })) or None def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -4435,7 +4434,7 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) - info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + info['heatmap'] = self._extract_heatmap(initial_data) contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), From 377e85a1797db9e98b78b38203ed9d4ded229991 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 7 Oct 2023 03:02:45 +0200 Subject: [PATCH 015/108] [cleanup] Misc (#8300) * Simplify nuxt regex * Fix tmz quotes and tests * Update test python versions Authored by: dirkf, gamer191, Grub4K --- .github/workflows/core.yml | 4 +- .github/workflows/download.yml | 2 +- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/tmz.py | 266 +++++++++++++++++---------------- 4 files changed, 138 insertions(+), 136 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 689408c500d6..7fcf11dfa2a2 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -13,7 +13,7 @@ jobs: matrix: os: [ubuntu-latest] # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10] + python-version: ['3.8', '3.9', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows @@ -21,7 +21,7 @@ jobs: python-version: '3.7' run-tests-ext: bat - os: windows-latest - python-version: '3.12-dev' + python-version: '3.12' run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 2b2387d4f134..c3478721c3f8 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -28,7 +28,7 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.10', 3.11-dev, pypy-3.7, pypy-3.8] + python-version: ['3.7', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index c94b4abdc210..c3ceb0039172 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1687,7 +1687,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) - FUNCTION_RE = r'\(function\((?P.*?)\){(?:.*?)return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' + FUNCTION_RE = r'\(function\((?P.*?)\){.*?\breturn\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' js, arg_keys, arg_vals = self._search_regex( (rf'', rf'{rectx}\(.*?{FUNCTION_RE}'), webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), diff --git a/yt_dlp/extractor/tmz.py b/yt_dlp/extractor/tmz.py index ffb30c6b8798..edd16bc5b29c 100644 --- a/yt_dlp/extractor/tmz.py +++ b/yt_dlp/extractor/tmz.py @@ -8,158 +8,160 @@ class TMZIE(InfoExtractor): - _VALID_URL = r"https?://(?:www\.)?tmz\.com/.*" + _VALID_URL = r'https?://(?:www\.)?tmz\.com/.*' _TESTS = [ { - "url": "http://www.tmz.com/videos/0-cegprt2p/", - "info_dict": { - "id": "http://www.tmz.com/videos/0-cegprt2p/", - "ext": "mp4", - "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", - "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.", - "timestamp": 1467831837, - "uploader": "TMZ Staff", - "upload_date": "20160706", - "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg", - "duration": 772.0, + 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'info_dict': { + 'id': 'http://www.tmz.com/videos/0-cegprt2p/', + 'ext': 'mp4', + 'title': 'No Charges Against Hillary Clinton? Harvey Says It Ain\'t Over Yet', + 'description': 'Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.', + 'timestamp': 1467831837, + 'uploader': 'TMZ Staff', + 'upload_date': '20160706', + 'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg', + 'duration': 772.0, }, }, { - "url": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", - "info_dict": { - "id": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", - "ext": "mp4", - "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women", - "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it's women's fault in the first place.", - "timestamp": 1562889485, - "uploader": "TMZ Staff", - "upload_date": "20190711", - "thumbnail": "https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg", - "duration": 123.0, + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'info_dict': { + 'id': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'ext': 'mp4', + 'title': 'Angry Bagel Shop Guy Says He Doesn\'t Trust Women', + 'description': 'The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it\'s women\'s fault in the first place.', + 'timestamp': 1562889485, + 'uploader': 'TMZ Staff', + 'upload_date': '20190711', + 'thumbnail': 'https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg', + 'duration': 123.0, }, }, { - "url": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", - "md5": "5429c85db8bde39a473a56ca8c4c5602", - "info_dict": { - "id": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", - "ext": "mp4", - "title": "Bobby Brown Tells Crowd ... Bobbi Kristina is Awake", - "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - "timestamp": 1429467813, - "uploader": "TMZ Staff", - "upload_date": "20150419", - "duration": 29.0, - "thumbnail": "https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg", + 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'md5': '5429c85db8bde39a473a56ca8c4c5602', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'ext': 'mp4', + 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', + 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + 'timestamp': 1429467813, + 'uploader': 'TMZ Staff', + 'upload_date': '20150419', + 'duration': 29.0, + 'thumbnail': 'https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg', }, }, { - "url": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "info_dict": { - "id": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "ext": "mp4", - "title": "Patti LaBelle -- Goes Nuclear On Stripping Fan", - "description": "Patti LaBelle made it known loud and clear last night ... NO " - "ONE gets on her stage and strips down.", - "timestamp": 1442683746, - "uploader": "TMZ Staff", - "upload_date": "20150919", - "duration": 104.0, - "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg", + 'url': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'ext': 'mp4', + 'title': 'Patti LaBelle -- Goes Nuclear On Stripping Fan', + 'description': 'Patti LaBelle made it known loud and clear last night ... NO ' + 'ONE gets on her stage and strips down.', + 'timestamp': 1442683746, + 'uploader': 'TMZ Staff', + 'upload_date': '20150919', + 'duration': 104.0, + 'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "info_dict": { - "id": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "ext": "mp4", - "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This", - "description": "Two pretty parts of this video with NBA Commish Adam Silver.", - "timestamp": 1454010989, - "uploader": "TMZ Staff", - "upload_date": "20160128", - "duration": 59.0, - "thumbnail": "https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg", + 'url': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'ext': 'mp4', + 'title': 'NBA\'s Adam Silver -- Blake Griffin\'s a Great Guy ... He\'ll Learn from This', + 'description': 'Two pretty parts of this video with NBA Commish Adam Silver.', + 'timestamp': 1454010989, + 'uploader': 'TMZ Staff', + 'upload_date': '20160128', + 'duration': 59.0, + 'thumbnail': 'https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "info_dict": { - "id": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "ext": "mp4", - "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!", - "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. he's ready and willing to go to jail for the crime.", - "timestamp": 1477500095, - "uploader": "TMZ Staff", - "upload_date": "20161026", - "thumbnail": "https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg", - "duration": 128.0, + 'url': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'ext': 'mp4', + 'title': 'Trump Star Vandal -- I\'m Not Afraid of Donald or the Cops!', + 'description': 'James Otis is the the guy who took a pickaxe to Donald Trump\'s star on the Walk of Fame, and he tells TMZ .. he\'s ready and willing to go to jail for the crime.', + 'timestamp': 1477500095, + 'uploader': 'TMZ Staff', + 'upload_date': '20161026', + 'thumbnail': 'https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg', + 'duration': 128.0, }, }, { - "url": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "info_dict": { - "id": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "ext": "mp4", - "title": "Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist " - "Demonstrators", - "description": "Beverly Hills may be an omen of what's coming next week, " - "because things got crazy on the streets and cops started " - "swinging their billy clubs at both Anti-Fascist and Pro-Trump " - "demonstrators.", - "timestamp": 1604182772, - "uploader": "TMZ Staff", - "upload_date": "20201031", - "duration": 96.0, - "thumbnail": "https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg", + 'url': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'info_dict': { + 'id': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'ext': 'mp4', + 'title': 'Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist ' + 'Demonstrators', + 'description': 'Beverly Hills may be an omen of what\'s coming next week, ' + 'because things got crazy on the streets and cops started ' + 'swinging their billy clubs at both Anti-Fascist and Pro-Trump ' + 'demonstrators.', + 'timestamp': 1604182772, + 'uploader': 'TMZ Staff', + 'upload_date': '20201031', + 'duration': 96.0, + 'thumbnail': 'https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg', }, }, { - "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/", - "info_dict": { - "id": "Dddb6IGe-ws", - "ext": "mp4", - "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing", - "uploader": "ESNEWS", - "description": "md5:49675bc58883ccf80474b8aa701e1064", - "upload_date": "20201102", - "uploader_id": "ESNEWS", - "uploader_url": "http://www.youtube.com/user/ESNEWS", - "like_count": int, - "channel_id": "UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel": "ESNEWS", - "view_count": int, - "duration": 225, - "live_status": "not_live", - "thumbnail": "https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp", - "channel_url": "https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel_follower_count": int, - "playable_in_embed": True, - "categories": ["Sports"], - "age_limit": 0, - "tags": "count:10", - "availability": "public", + 'url': 'https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/', + 'info_dict': { + 'id': 'Dddb6IGe-ws', + 'ext': 'mp4', + 'title': 'SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing', + 'uploader': 'ESNEWS', + 'description': 'md5:49675bc58883ccf80474b8aa701e1064', + 'upload_date': '20201102', + 'uploader_id': '@ESNEWS', + 'uploader_url': 'https://www.youtube.com/@ESNEWS', + 'like_count': int, + 'channel_id': 'UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel': 'ESNEWS', + 'view_count': int, + 'duration': 225, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp', + 'channel_url': 'https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'categories': ['Sports'], + 'age_limit': 0, + 'tags': 'count:10', + 'availability': 'public', + 'comment_count': int, }, }, { - "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/", - "info_dict": { - "id": "1329450007125225473", - "ext": "mp4", - "title": "The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.", - "uploader": "The Mac Life", - "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69", - "upload_date": "20201119", - "uploader_id": "TheMacLife", - "timestamp": 1605800556, - "thumbnail": "https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small", - "like_count": int, - "duration": 11.812, - "uploader_url": "https://twitter.com/TheMacLife", - "age_limit": 0, - "repost_count": int, - "tags": [], - "comment_count": int, + 'url': 'https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/', + 'info_dict': { + 'id': '1329448013937471491', + 'ext': 'mp4', + 'title': 'The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.', + 'uploader': 'The Mac Life', + 'description': 'md5:56e6009bbc3d12498e10d08a8e1f1c69', + 'upload_date': '20201119', + 'display_id': '1329450007125225473', + 'uploader_id': 'TheMacLife', + 'timestamp': 1605800556, + 'thumbnail': 'https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small', + 'like_count': int, + 'duration': 11.812, + 'uploader_url': 'https://twitter.com/TheMacLife', + 'age_limit': 0, + 'repost_count': int, + 'tags': [], + 'comment_count': int, }, }, ] @@ -167,25 +169,25 @@ class TMZIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url) jsonld = self._search_json_ld(webpage, url) - if not jsonld or "url" not in jsonld: + if not jsonld or 'url' not in jsonld: # try to extract from YouTube Player API # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions match_obj = re.search(r'\.cueVideoById\(\s*(?P[\'"])(?P.*?)(?P=quote)', webpage) if match_obj: - res = self.url_result(match_obj.group("id")) + res = self.url_result(match_obj.group('id')) return res # try to extract from twitter - blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage) + blockquote_el = get_element_by_attribute('class', 'twitter-tweet', webpage) if blockquote_el: matches = re.findall( r']+href=\s*(?P[\'"])(?P.*?)(?P=quote)', blockquote_el) if matches: for _, match in matches: - if "/status/" in match: + if '/status/' in match: res = self.url_result(match) return res - raise ExtractorError("No video found!") + raise ExtractorError('No video found!') if id not in jsonld: - jsonld["id"] = url + jsonld['id'] = url return jsonld From 4392c4680c383b221b6aa26d25c6e4b5581a5ad6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 7 Oct 2023 01:28:34 +0000 Subject: [PATCH 016/108] Release 2023.10.07 Created by: Grub4K :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++--- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++--- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++--- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++--- CONTRIBUTORS | 6 ++++ Changelog.md | 29 +++++++++++++++++++ supportedsites.md | 4 +-- yt_dlp/version.py | 4 +-- 10 files changed, 63 insertions(+), 28 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index f0fc71d57532..dacb41758d43 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ac9a72a1c1c4..ec6e298a1921 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 577e4d491004..cf3cdd21f317 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 9529c1bd6c6d..1bbcf6895663 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index b17a6e046c8f..d3bc06e80917 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 5345e8917c20..30311d5b5694 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 72b9584ecf30..8eda41307264 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -503,3 +503,9 @@ Yalab7 zhallgato zhong-yiyu Zprokkel +AS6939 +drzraf +handlerug +jiru +madewokherd +xofe diff --git a/Changelog.md b/Changelog.md index 04511927faf9..48dcbf10295c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,35 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.10.07 + +#### Extractor changes +- **abc.net.au**: iview: [Improve `episode` extraction](https://github.com/yt-dlp/yt-dlp/commit/a9efb4b8d74f3583450ffda0ee57259a47d39c70) ([#8201](https://github.com/yt-dlp/yt-dlp/issues/8201)) by [xofe](https://github.com/xofe) +- **erocast**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/47c598783c98c179e04dd12c2a3fee0f3dc53087) ([#8264](https://github.com/yt-dlp/yt-dlp/issues/8264)) by [madewokherd](https://github.com/madewokherd) +- **gofile**: [Fix token cookie bug](https://github.com/yt-dlp/yt-dlp/commit/0730d5a966fa8a937d84bfb7f68be5198acb039b) by [bashonly](https://github.com/bashonly) +- **iq.com**: [Fix extraction and subtitles](https://github.com/yt-dlp/yt-dlp/commit/35d9cbaf9638ccc9daf8a863063b2e7c135bc664) ([#8260](https://github.com/yt-dlp/yt-dlp/issues/8260)) by [AS6939](https://github.com/AS6939) +- **lbry** + - [Add playlist support](https://github.com/yt-dlp/yt-dlp/commit/48cceec1ddb8649b5e771df8df79eb9c39c82b90) ([#8213](https://github.com/yt-dlp/yt-dlp/issues/8213)) by [bashonly](https://github.com/bashonly), [drzraf](https://github.com/drzraf), [Grub4K](https://github.com/Grub4K) + - [Extract `uploader_id`](https://github.com/yt-dlp/yt-dlp/commit/0e722f2f3ca42e634fd7b06ee70b16bf833ce132) ([#8244](https://github.com/yt-dlp/yt-dlp/issues/8244)) by [drzraf](https://github.com/drzraf) +- **litv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/91a670a4f7babe9c8aa2018f57d8c8952a6f49d8) ([#7785](https://github.com/yt-dlp/yt-dlp/issues/7785)) by [jiru](https://github.com/jiru) +- **neteasemusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/f980df734cf5c0eaded2f7b38c6c60bccfeebb48) ([#8181](https://github.com/yt-dlp/yt-dlp/issues/8181)) by [c-basalt](https://github.com/c-basalt) +- **nhk**: [Fix VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/e831c80e8b2fc025b3b67d82974cc59e3526fdc8) ([#8249](https://github.com/yt-dlp/yt-dlp/issues/8249)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Improve extraction](https://github.com/yt-dlp/yt-dlp/commit/2ad3873f0dfa9285c91d2160e36c039e69d597c7) ([#8221](https://github.com/yt-dlp/yt-dlp/issues/8221)) by [garret1317](https://github.com/garret1317) +- **substack** + - [Fix download cookies bug](https://github.com/yt-dlp/yt-dlp/commit/2f2dda3a7e85148773da3cdbc03ac9949ec1bc45) ([#8219](https://github.com/yt-dlp/yt-dlp/issues/8219)) by [handlerug](https://github.com/handlerug) + - [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/fbcc299bd8a19cf8b3c8805d6c268a9110230973) ([#8218](https://github.com/yt-dlp/yt-dlp/issues/8218)) by [handlerug](https://github.com/handlerug) +- **theta**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/792f1e64f6a2beac51e85408d142b3118115c4fd) ([#8251](https://github.com/yt-dlp/yt-dlp/issues/8251)) by [alerikaisattera](https://github.com/alerikaisattera) +- **wrestleuniversevod**: [Call API with device ID](https://github.com/yt-dlp/yt-dlp/commit/b095fd3fa9d58a65dc9b830bd63b9d909422aa86) ([#8272](https://github.com/yt-dlp/yt-dlp/issues/8272)) by [bashonly](https://github.com/bashonly) +- **xhamster**: user: [Support creator urls](https://github.com/yt-dlp/yt-dlp/commit/cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3) ([#8232](https://github.com/yt-dlp/yt-dlp/issues/8232)) by [Grub4K](https://github.com/Grub4K) +- **youtube** + - [Fix `heatmap` extraction](https://github.com/yt-dlp/yt-dlp/commit/03e85ea99db76a2fddb65bf46f8819bda780aaf3) ([#8299](https://github.com/yt-dlp/yt-dlp/issues/8299)) by [bashonly](https://github.com/bashonly) + - [Raise a warning for `Incomplete Data` instead of an error](https://github.com/yt-dlp/yt-dlp/commit/eb5bdbfa70126c7d5355cc0954b63720522e462c) ([#8238](https://github.com/yt-dlp/yt-dlp/issues/8238)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **cleanup** + - [Update extractor tests](https://github.com/yt-dlp/yt-dlp/commit/19c90e405b4137c06dfe6f9aaa02396df0da93e5) ([#7718](https://github.com/yt-dlp/yt-dlp/issues/7718)) by [trainman261](https://github.com/trainman261) + - Miscellaneous: [377e85a](https://github.com/yt-dlp/yt-dlp/commit/377e85a1797db9e98b78b38203ed9d4ded229991) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K) + ### 2023.09.24 #### Important changes diff --git a/supportedsites.md b/supportedsites.md index 620e0f3058e4..ecef4dc2d1e0 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -422,6 +422,7 @@ - **eplus:inbound**: e+ (イープラス) overseas - **Epoch** - **Eporner** + - **Erocast** - **EroProfile**: [*eroprofile*](## "netrc machine") - **EroProfile:album** - **ertflix**: ERTFLIX videos @@ -699,6 +700,7 @@ - **LastFMUser** - **lbry** - **lbry:channel** + - **lbry:playlist** - **LCI** - **Lcp** - **LcpPlay** @@ -1474,8 +1476,6 @@ - **ThePlatformFeed** - **TheStar** - **TheSun** - - **ThetaStream** - - **ThetaVideo** - **TheWeatherChannel** - **ThisAmericanLife** - **ThisAV** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 2a7c84b93f41..60c1c94cc35f 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.09.24' +__version__ = '2023.10.07' -RELEASE_GIT_HEAD = '088add9567d39b758737e4299a0e619fd89d2e8f' +RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991' VARIANT = None From 9d7ded6419089c1bf252496073f73ad90ed71004 Mon Sep 17 00:00:00 2001 From: Awal Garg Date: Sun, 8 Oct 2023 01:57:23 +0200 Subject: [PATCH 017/108] [utils] `js_to_json`: Fix `Date` constructor parsing (#8295) Authored by: awalgarg, Grub4K --- test/test_utils.py | 7 ++++++- yt_dlp/utils/_utils.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index fd612ff86f91..77040f29c604 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1209,6 +1209,9 @@ def test_js_to_json_edgecases(self): on = js_to_json('\'"\\""\'') self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped') + on = js_to_json('[new Date("spam"), \'("eggs")\']') + self.assertEqual(json.loads(on), ['spam', '("eggs")'], msg='Date regex should match a single string') + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') @@ -1220,11 +1223,13 @@ def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') self.assertEqual(js_to_json('`${name}`', {}), '"name"') - def test_js_to_json_map_array_constructors(self): + def test_js_to_json_common_constructors(self): self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5}) self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10]) self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5]) self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5}) + self.assertEqual(json.loads(js_to_json('new Date("123")')), "123") + self.assertEqual(json.loads(js_to_json('new Date(\'2023-10-19\')')), "2023-10-19") def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index ba6242380638..3dc17bf59312 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2744,7 +2744,7 @@ def create_map(mobj): code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) if not strict: - code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code) code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code) code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code) From 1c51c520f7b511ebd9e4eb7322285a8c31eedbbd Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 8 Oct 2023 02:01:01 +0200 Subject: [PATCH 018/108] [fd/fragment] Improve progress calculation (#8241) This uses the download speed from all threads and also adds smoothing to speed and eta Authored by: Grub4K --- yt_dlp/downloader/fragment.py | 48 ++++++--------- yt_dlp/utils/progress.py | 109 ++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 29 deletions(-) create mode 100644 yt_dlp/utils/progress.py diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index b4b680dae119..b4f003d37f54 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -14,6 +14,7 @@ from ..networking.exceptions import HTTPError, IncompleteRead from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj from ..utils.networking import HTTPHeaderDict +from ..utils.progress import ProgressCalculator class HttpQuietDownloader(HttpFD): @@ -226,8 +227,7 @@ def _start_frag_download(self, ctx, info_dict): resume_len = ctx['complete_frags_downloaded_bytes'] total_frags = ctx['total_frags'] ctx_id = ctx.get('ctx_id') - # This dict stores the download progress, it's updated by the progress - # hook + # Stores the download progress, updated by the progress hook state = { 'status': 'downloading', 'downloaded_bytes': resume_len, @@ -237,14 +237,8 @@ def _start_frag_download(self, ctx, info_dict): 'tmpfilename': ctx['tmpfilename'], } - start = time.time() - ctx.update({ - 'started': start, - 'fragment_started': start, - # Amount of fragment's bytes downloaded by the time of the previous - # frag progress hook invocation - 'prev_frag_downloaded_bytes': 0, - }) + ctx['started'] = time.time() + progress = ProgressCalculator(resume_len) def frag_progress_hook(s): if s['status'] not in ('downloading', 'finished'): @@ -259,38 +253,35 @@ def frag_progress_hook(s): state['max_progress'] = ctx.get('max_progress') state['progress_idx'] = ctx.get('progress_idx') - time_now = time.time() - state['elapsed'] = time_now - start + state['elapsed'] = progress.elapsed frag_total_bytes = s.get('total_bytes') or 0 s['fragment_info_dict'] = s.pop('info_dict', {}) + + # XXX: Fragment resume is not accounted for here if not ctx['live']: estimated_size = ( (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / (state['fragment_index'] + 1) * total_frags) - state['total_bytes_estimate'] = estimated_size + progress.total = estimated_size + progress.update(s.get('downloaded_bytes')) + state['total_bytes_estimate'] = progress.total + else: + progress.update(s.get('downloaded_bytes')) if s['status'] == 'finished': state['fragment_index'] += 1 ctx['fragment_index'] = state['fragment_index'] - state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] - ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_total_bytes) - ctx['fragment_started'] = time.time() - ctx['prev_frag_downloaded_bytes'] = 0 - else: - frag_downloaded_bytes = s['downloaded_bytes'] - state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) - if not ctx['live']: - state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) - ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes + progress.thread_reset() + + state['downloaded_bytes'] = ctx['complete_frags_downloaded_bytes'] = progress.downloaded + state['speed'] = ctx['speed'] = progress.speed.smooth + state['eta'] = progress.eta.smooth + self._hook_progress(state, info_dict) ctx['dl'].add_progress_hook(frag_progress_hook) - return start + return ctx['started'] def _finish_frag_download(self, ctx, info_dict): ctx['dest_stream'].close() @@ -500,7 +491,6 @@ def _download_fragment(fragment): download_fragment(fragment, ctx_copy) return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') - self.report_warning('The download speed shown is only of one thread. This is a known issue') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: try: for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): diff --git a/yt_dlp/utils/progress.py b/yt_dlp/utils/progress.py new file mode 100644 index 000000000000..f254a3887e43 --- /dev/null +++ b/yt_dlp/utils/progress.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import bisect +import threading +import time + + +class ProgressCalculator: + # Time to calculate the speed over (seconds) + SAMPLING_WINDOW = 3 + # Minimum timeframe before to sample next downloaded bytes (seconds) + SAMPLING_RATE = 0.05 + # Time before showing eta (seconds) + GRACE_PERIOD = 1 + + def __init__(self, initial: int): + self._initial = initial or 0 + self.downloaded = self._initial + + self.elapsed: float = 0 + self.speed = SmoothValue(0, smoothing=0.7) + self.eta = SmoothValue(None, smoothing=0.9) + + self._total = 0 + self._start_time = time.monotonic() + self._last_update = self._start_time + + self._lock = threading.Lock() + self._thread_sizes: dict[int, int] = {} + + self._times = [self._start_time] + self._downloaded = [self.downloaded] + + @property + def total(self): + return self._total + + @total.setter + def total(self, value: int | None): + with self._lock: + if value is not None and value < self.downloaded: + value = self.downloaded + + self._total = value + + def thread_reset(self): + current_thread = threading.get_ident() + with self._lock: + self._thread_sizes[current_thread] = 0 + + def update(self, size: int | None): + if not size: + return + + current_thread = threading.get_ident() + + with self._lock: + last_size = self._thread_sizes.get(current_thread, 0) + self._thread_sizes[current_thread] = size + self._update(size - last_size) + + def _update(self, size: int): + current_time = time.monotonic() + + self.downloaded += size + self.elapsed = current_time - self._start_time + if self.total is not None and self.downloaded > self.total: + self._total = self.downloaded + + if self._last_update + self.SAMPLING_RATE > current_time: + return + self._last_update = current_time + + self._times.append(current_time) + self._downloaded.append(self.downloaded) + + offset = bisect.bisect_left(self._times, current_time - self.SAMPLING_WINDOW) + del self._times[:offset] + del self._downloaded[:offset] + if len(self._times) < 2: + self.speed.reset() + self.eta.reset() + return + + download_time = current_time - self._times[0] + if not download_time: + return + + self.speed.set((self.downloaded - self._downloaded[0]) / download_time) + if self.total and self.speed.value and self.elapsed > self.GRACE_PERIOD: + self.eta.set((self.total - self.downloaded) / self.speed.value) + else: + self.eta.reset() + + +class SmoothValue: + def __init__(self, initial: float | None, smoothing: float): + self.value = self.smooth = self._initial = initial + self._smoothing = smoothing + + def set(self, value: float): + self.value = value + if self.smooth is None: + self.smooth = self.value + else: + self.smooth = (1 - self._smoothing) * value + self._smoothing * self.smooth + + def reset(self): + self.value = self.smooth = self._initial From b7098d46b552a9322c6cea39ba80be5229f922de Mon Sep 17 00:00:00 2001 From: naginatana <96737708+naginatana@users.noreply.github.com> Date: Tue, 10 Oct 2023 01:46:16 +0800 Subject: [PATCH 019/108] [ie/youku] Improve tudou.com support (#8160) Authored by: naginatana --- yt_dlp/extractor/youku.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index 7ecd9f18392b..e351765868e7 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -20,7 +20,7 @@ class YoukuIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?://( - (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + (?:v|play(?:er)?)\.(?:youku|tudou)\.com/(?:v_show/id_|player\.php/sid/)| video\.tudou\.com/v/)| youku:) (?P[A-Za-z0-9]+)(?:\.html|/v\.swf|) @@ -87,6 +87,19 @@ class YoukuIE(InfoExtractor): 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==', 'tags': list, }, + }, { + 'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng==.html?', + 'info_dict': { + 'id': 'XNjAxNjI2OTU3Ng', + 'ext': 'mp4', + 'title': '阿斯塔意识到哈里杀了人,自己被骗了', + 'thumbnail': 'https://m.ykimg.com/0541010164F732752794D4D7B70331D1', + 'uploader_id': '88758207', + 'tags': [], + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UMzU1MDMyODI4', + 'uploader': '英美剧场', + 'duration': 72.91, + }, }] @staticmethod From 09f815ad52843219a7ee3f2a0dddf6c250c91f0c Mon Sep 17 00:00:00 2001 From: Stefan Lobbenmeier Date: Mon, 9 Oct 2023 19:51:37 +0200 Subject: [PATCH 020/108] [ie/ArteTV] Support age-restricted content (#8301) Closes #7782 Authored by: StefanLobbenmeier --- yt_dlp/extractor/arte.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index a19cd2a3aee7..139a3a729f9b 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -48,17 +48,7 @@ class ArteTVIE(ArteTVBaseIE): }, { 'note': 'No alt_title', 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', - 'info_dict': { - 'id': '110371-000-A', - 'ext': 'mp4', - 'upload_date': '20220718', - 'duration': 154, - 'timestamp': 1658162460, - 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', - 'title': 'La chaleur, supplice des arbres de rue', - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', - }, - 'params': {'skip_download': 'm3u8'} + 'only_matching': True, }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, @@ -67,19 +57,20 @@ class ArteTVIE(ArteTVBaseIE): 'only_matching': True, }, { 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/', + 'only_matching': True, + }, { + 'note': 'age-restricted', + 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/', 'info_dict': { - 'id': '110203-006-A', - 'chapters': 'count:16', - 'description': 'md5:cf592f1df52fe52007e3f8eac813c084', - 'alt_title': 'Zaz', - 'title': 'Baloise Session 2022', - 'timestamp': 1668445200, - 'duration': 4054, - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530', - 'upload_date': '20221114', + 'id': '006785-000-A', + 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba', + 'title': 'The Element of Crime', + 'timestamp': 1696111200, + 'duration': 5849, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530', + 'upload_date': '20230930', 'ext': 'mp4', - }, - 'expected_warnings': ['geo restricted'] + } }] _GEO_BYPASS = True @@ -136,7 +127,9 @@ def _real_extract(self, url): lang = mobj.group('lang') or mobj.group('lang_2') langauge_code = self._LANG_MAP.get(lang) - config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={ + 'x-validated-age': '18' + }) geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} if geoblocking.get('restrictedArea'): From 88a99c87b680ae59002534a517e191f46c42cbd4 Mon Sep 17 00:00:00 2001 From: Midnight Veil Date: Tue, 10 Oct 2023 04:55:46 +1100 Subject: [PATCH 021/108] [ie/tenplay] Add support for seasons (#7939) Closes #7744 Authored by: midnightveil --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/tenplay.py | 58 +++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 55c3c2f8e88b..6717a6039fde 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1992,7 +1992,10 @@ WeTvSeriesIE, ) from .tennistv import TennisTVIE -from .tenplay import TenPlayIE +from .tenplay import ( + TenPlayIE, + TenPlaySeasonIE, +) from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index c7097cf02548..7ce7cbf849ea 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,9 +1,11 @@ -from datetime import datetime import base64 +import functools +import itertools +from datetime import datetime from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, urlencode_postdata +from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin class TenPlayIE(InfoExtractor): @@ -113,3 +115,55 @@ def _real_extract(self, url): 'uploader': 'Channel 10', 'uploader_id': '2199827728001', } + + +class TenPlaySeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?P[^/?#]+)/episodes/(?P[^/?#]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://10play.com.au/masterchef/episodes/season-14', + 'info_dict': { + 'title': 'Season 14', + 'id': 'MjMyOTIy', + }, + 'playlist_mincount': 64, + }, { + 'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2022', + 'info_dict': { + 'title': 'Season 2022', + 'id': 'Mjc0OTIw', + }, + 'playlist_mincount': 256, + }] + + def _entries(self, load_more_url, display_id=None): + skip_ids = [] + for page in itertools.count(1): + episodes_carousel = self._download_json( + load_more_url, display_id, query={'skipIds[]': skip_ids}, + note=f'Fetching episodes page {page}') + + episodes_chunk = episodes_carousel['items'] + skip_ids.extend(ep['id'] for ep in episodes_chunk) + + for ep in episodes_chunk: + yield ep['cardLink'] + if not episodes_carousel['hasMore']: + break + + def _real_extract(self, url): + show, season = self._match_valid_url(url).group('show', 'season') + season_info = self._download_json( + f'https://10play.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}') + + episodes_carousel = traverse_obj(season_info, ( + 'content', 0, 'components', ( + lambda _, v: v['title'].lower() == 'episodes', + (..., {dict}), + )), get_all=False) or {} + + playlist_id = episodes_carousel['tpId'] + + return self.playlist_from_matches( + self._entries(urljoin(url, episodes_carousel['loadMoreUrl']), playlist_id), + playlist_id, traverse_obj(season_info, ('content', 0, 'title', {str})), + getter=functools.partial(urljoin, url)) From 4de94b9e165bfd6421a692f5f2eabcdb08edcb71 Mon Sep 17 00:00:00 2001 From: garret Date: Mon, 9 Oct 2023 19:00:26 +0100 Subject: [PATCH 022/108] [ie/nhk] Fix Japanese-language VOD extraction (#8309) Closes #8303 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 68 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index bcbc2279f6f3..f6b5c501bb05 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -68,11 +68,12 @@ def _extract_formats_and_subtitles(self, vod_id): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None - lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if len(episode_id) == 7: + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') + is_video = m_type == 'video' + + if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] @@ -133,47 +134,46 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = r'%s%s(?P[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?Pvideo)/(?P[0-9a-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}/(?Paudio)/(?P[^/?#]+?-\d{{8}}-[0-9a-z]+)'] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/', 'info_dict': { - 'id': 'yd8322ch', + 'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302', 'ext': 'mp4', - 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', - 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', - 'upload_date': '20230514', - 'timestamp': 1684083791, - 'series': 'GRAND SUMO Highlights', - 'episode': '[Recap] May Tournament Day 1 (Opening Day)', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', + 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', + 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', + 'series': 'Japan Railway Journal', }, }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'md5': '153c3016dfd252ba09726588149cf0e7', 'info_dict': { - 'id': 'a95j5iza', + 'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5', 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + # radio + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/', 'info_dict': { - 'id': 'r_inventions-20201104-1-en', + 'id': 'livinginjapan-20231001-1-en', 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', + 'series': 'Living in Japan', + 'description': 'md5:850611969932874b4a3309e0cae06c2f', + 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, - 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -199,6 +199,19 @@ class NhkVodIE(NhkBaseIE): 'timestamp': 1623722008, }, 'skip': '404 Not Found', + }, { + # japanese-language, longer id than english + 'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/', + 'info_dict': { + 'id': 'nw_ja_v_jvod_ohayou_20231008', + 'ext': 'mp4', + 'title': 'おはよう日本(7時台) - 10月8日放送', + 'series': 'おはよう日本(7時台)', + 'episode': '10月8日放送', + 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', + }, + 'skip': 'expires 2023-10-15', }] def _real_extract(self, url): @@ -206,7 +219,7 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P[0-9a-z]+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P\w+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' _TESTS = [{ # video program episodes 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', @@ -240,8 +253,7 @@ class NhkVodProgramIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() - + lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( program_id, lang, m_type == 'video', False, episode_type == 'clip') From 84e26038d4002e763ea51ca1bdce4f7e63c540bf Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 9 Oct 2023 13:30:36 -0500 Subject: [PATCH 023/108] [utils] `write_xattr`: Use `os.setxattr` if available (#8205) Closes #8193 Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- README.md | 2 +- yt_dlp/utils/_utils.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a0b69c9a1a16..a26482faaad3 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**mutagen**](https://github.com/quodlibet/mutagen)\* - For `--embed-thumbnail` in certain formats. Licensed under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) * [**AtomicParsley**](https://github.com/wez/atomicparsley) - For `--embed-thumbnail` in `mp4`/`m4a` files when `mutagen`/`ffmpeg` cannot. Licensed under [GPLv2+](https://github.com/wez/atomicparsley/blob/master/COPYING) -* [**xattr**](https://github.com/xattr/xattr), [**pyxattr**](https://github.com/iustin/pyxattr) or [**setfattr**](http://savannah.nongnu.org/projects/attr) - For writing xattr metadata (`--xattr`) on **Linux**. Licensed under [MIT](https://github.com/xattr/xattr/blob/master/LICENSE.txt), [LGPL2.1](https://github.com/iustin/pyxattr/blob/master/COPYING) and [GPLv2+](http://git.savannah.nongnu.org/cgit/attr.git/tree/doc/COPYING) respectively +* [**xattr**](https://github.com/xattr/xattr), [**pyxattr**](https://github.com/iustin/pyxattr) or [**setfattr**](http://savannah.nongnu.org/projects/attr) - For writing xattr metadata (`--xattr`) on **Mac** and **BSD**. Licensed under [MIT](https://github.com/xattr/xattr/blob/master/LICENSE.txt), [LGPL2.1](https://github.com/iustin/pyxattr/blob/master/COPYING) and [GPLv2+](http://git.savannah.nongnu.org/cgit/attr.git/tree/doc/COPYING) respectively ### Misc diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 3dc17bf59312..10c7c431103b 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -4441,10 +4441,12 @@ def write_xattr(path, key, value): raise XAttrMetadataError(e.errno, e.strerror) return - # UNIX Method 1. Use xattrs/pyxattrs modules + # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules setxattr = None - if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': + if callable(getattr(os, 'setxattr', None)): + setxattr = os.setxattr + elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': # Unicode arguments are not supported in pyxattr until version 0.5.0 # See https://github.com/ytdl-org/youtube-dl/issues/5498 if version_tuple(xattr.__version__) >= (0, 5, 0): From feebf6d02fc9651331eee2af5e08e6112288163b Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 12 Oct 2023 12:20:52 +0200 Subject: [PATCH 024/108] [ie/youtube] Fix bug with `--extractor-retries inf` (#8328) Authored by: Grub4K --- yt_dlp/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b7ac3e9cc1ce..c5be366362ee 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -947,7 +947,10 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers icd_rm = next(icd_retries) main_retries = iter(self.RetryManager()) main_rm = next(main_retries) - for _ in range(main_rm.retries + icd_rm.retries + 1): + # Manual retry loop for multiple RetryManagers + # The proper RetryManager MUST be advanced after an error + # and it's result MUST be checked if the manager is non fatal + while True: try: response = self._call_api( ep=ep, fatal=True, headers=headers, From b9316642313bbc9e209ac0d2276d37ba60bceb49 Mon Sep 17 00:00:00 2001 From: bashonly Date: Fri, 13 Oct 2023 14:23:39 -0500 Subject: [PATCH 025/108] [ie/radiko] Fix bug with `downloader_options` Closes #8333 Authored by: bashonly --- yt_dlp/extractor/radiko.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index 8c8fb1a8f90b..c363d9ba5f7b 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -154,7 +154,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, sf['preference'] = -100 sf['format_note'] = 'not preferred' if not is_onair and timefree_int == 1 and time_to_skip: - sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} + sf['downloader_options'] = {'ffmpeg_args': ['-ss', str(time_to_skip)]} formats.extend(subformats) return formats From e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:29:56 +0200 Subject: [PATCH 026/108] [ie/mbn] Add extractor (#8312) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mbn.py | 89 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 yt_dlp/extractor/mbn.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6717a6039fde..45073628c879 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1053,6 +1053,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .masters import MastersIE from .matchtv import MatchTVIE +from .mbn import MBNIE from .mdr import MDRIE from .medaltv import MedalTVIE from .mediaite import MediaiteIE diff --git a/yt_dlp/extractor/mbn.py b/yt_dlp/extractor/mbn.py new file mode 100644 index 000000000000..4917c4698e66 --- /dev/null +++ b/yt_dlp/extractor/mbn.py @@ -0,0 +1,89 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class MBNIE(InfoExtractor): + IE_DESC = 'mbn.co.kr (매일방송)' + _VALID_URL = r'https?://(?:www\.)?mbn\.co\.kr/vod/programContents/preview(?:list)?/\d+/\d+/(?P\d+)' + _TESTS = [{ + 'url': 'https://mbn.co.kr/vod/programContents/previewlist/861/5433/1276155', + 'md5': '85e1694e5b247c04d1386b7e3c90fd76', + 'info_dict': { + 'id': '1276155', + 'ext': 'mp4', + 'title': '결국 사로잡힌 권유리, 그녀를 목숨 걸고 구하려는 정일우!', + 'duration': 3891, + 'release_date': '20210703', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/861/2021/07/03/20210703230811_20_861_1276155_360_7_0.jpg', + 'series': '보쌈 - 운명을 훔치다', + 'episode': 'Episode 19', + 'episode_number': 19, + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/previewlist/835/5294/1084744', + 'md5': 'fc65d3aac85e85e0b5056f4ef99cde4a', + 'info_dict': { + 'id': '1084744', + 'ext': 'mp4', + 'title': '김정은♥최원영, 제자리를 찾은 위험한 부부! "결혼은 투쟁이면서, 어려운 방식이야.."', + 'duration': 93, + 'release_date': '20201124', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/835/2020/11/25/20201125000221_21_835_1084744_360_7_0.jpg', + 'series': '나의 위험한 아내', + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/preview/952/6088/1054797?next=1', + 'md5': 'c711103c72aeac8323a5cf1751f10097', + 'info_dict': { + 'id': '1054797', + 'ext': 'mp4', + 'title': '[2차 티저] MBN 주말 미니시리즈 <완벽한 결혼의 정석> l 그녀에게 주어진 두 번째 인생', + 'duration': 65, + 'release_date': '20231028', + 'thumbnail': 'http://img.vod.mbn.co.kr/vod2/952/2023/09/11/20230911130223_22_952_1054797_1080_7.jpg', + 'series': '완벽한 결혼의 정석', + }, + }] + + def _real_extract(self, url): + content_id = self._match_id(url) + webpage = self._download_webpage(url, content_id) + + content_cls_cd = self._search_regex( + r'"\?content_cls_cd=(\d+)&', webpage, 'content cls cd', fatal=False) or '20' + media_info = self._download_json( + 'https://www.mbn.co.kr/player/mbnVodPlayer_2020.mbn', content_id, + note='Fetching playback data', query={ + 'content_cls_cd': content_cls_cd, + 'content_id': content_id, + 'relay_type': '1', + }) + + formats = [] + for stream_url in traverse_obj(media_info, ('movie_list', ..., 'url', {url_or_none})): + stream_url = re.sub(r'/(?:chunk|play)list(?:_pd\d+)?\.m3u8', '/manifest.m3u8', stream_url) + final_url = url_or_none(self._download_webpage( + f'https://www.mbn.co.kr/player/mbnStreamAuth_new_vod.mbn?vod_url={stream_url}', + content_id, note='Fetching authenticated m3u8 url')) + + formats.extend(self._extract_m3u8_formats(final_url, content_id, fatal=False)) + + return { + 'id': content_id, + **traverse_obj(media_info, { + 'title': ('movie_title', {str}), + 'duration': ('play_sec', {int_or_none}), + 'release_date': ('bcast_date', {lambda x: x.replace('.', '')}, {unified_strdate}), + 'thumbnail': ('movie_start_Img', {url_or_none}), + 'series': ('prog_nm', {str}), + 'episode_number': ('ad_contentnumber', {int_or_none}), + }), + 'formats': formats, + } From b286ec68f1f28798b3e371f888a2ed97d399cf77 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:30:24 +0200 Subject: [PATCH 027/108] [ie/jtbc] Add extractors (#8314) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/jtbc.py | 156 ++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 yt_dlp/extractor/jtbc.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 45073628c879..ca4571182828 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -896,6 +896,10 @@ from .jove import JoveIE from .joj import JojIE from .jstream import JStreamIE +from .jtbc import ( + JTBCIE, + JTBCProgramIE, +) from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE diff --git a/yt_dlp/extractor/jtbc.py b/yt_dlp/extractor/jtbc.py new file mode 100644 index 000000000000..573f7492fe71 --- /dev/null +++ b/yt_dlp/extractor/jtbc.py @@ -0,0 +1,156 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class JTBCIE(InfoExtractor): + IE_DESC = 'jtbc.co.kr' + _VALID_URL = r'''(?x) + https?://(?: + vod\.jtbc\.co\.kr/player/(?:program|clip) + |tv\.jtbc\.co\.kr/(?:replay|trailer|clip)/pr\d+/pm\d+ + )/(?P(?:ep|vo)\d+)''' + _GEO_COUNTRIES = ['KR'] + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10011629/pm10067930/ep20216321/view', + 'md5': 'e6ade71d8c8685bbfd6e6ce4167c6a6c', + 'info_dict': { + 'id': 'VO10721192', + 'display_id': 'ep20216321', + 'ext': 'mp4', + 'title': '힘쎈여자 강남순 2회 다시보기', + 'description': 'md5:043c1d9019100ce271dba09995dbd1e2', + 'duration': 3770.0, + 'release_date': '20231008', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/drama/stronggirlnamsoon/img/20231008_163541_522_1.jpg', + 'series': '힘쎈여자 강남순', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/program/ep20216733', + 'md5': '217a6d190f115a75e4bda0ceaa4cd7f4', + 'info_dict': { + 'id': 'VO10721429', + 'display_id': 'ep20216733', + 'ext': 'mp4', + 'title': '헬로 마이 닥터 친절한 진료실 149회 다시보기', + 'description': 'md5:1d70788a982dd5de26874a92fcffddb8', + 'duration': 2720.0, + 'release_date': '20231009', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/culture/hellomydoctor/img/20231009_095002_528_1.jpg', + 'series': '헬로 마이 닥터 친절한 진료실', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/clip/vo10721270', + 'md5': '05782e2dc22a9c548aebefe62ae4328a', + 'info_dict': { + 'id': 'VO10721270', + 'display_id': 'vo10721270', + 'ext': 'mp4', + 'title': '뭉쳐야 찬다3 2회 예고편 - A매치로 향하는 마지막 관문💥', + 'description': 'md5:d48b51a8655c84843b4ed8d0c39aae68', + 'duration': 46.0, + 'release_date': '20231015', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/soccer3/img/20231008_210957_775_1.jpg', + 'series': '뭉쳐야 찬다3', + }, + }, { + 'url': 'https://tv.jtbc.co.kr/trailer/pr10010392/pm10032526/vo10720912/view', + 'md5': '367d480eb3ef54a9cd7a4b4d69c4b32d', + 'info_dict': { + 'id': 'VO10720912', + 'display_id': 'vo10720912', + 'ext': 'mp4', + 'title': '아는 형님 404회 예고편 | 10월 14일(토) 저녁 8시 50분 방송!', + 'description': 'md5:2743bb1079ceb85bb00060f2ad8f0280', + 'duration': 148.0, + 'release_date': '20231014', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/jtbcbros/img/20231006_230023_802_1.jpg', + 'series': '아는 형님', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + if display_id.startswith('vo'): + video_id = display_id.upper() + else: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-vod="(VO\d+)"', webpage, 'vod id') + + playback_data = self._download_json( + f'https://api.jtbc.co.kr/vod/{video_id}', video_id, note='Downloading VOD playback data') + + subtitles = {} + for sub in traverse_obj(playback_data, ('tracks', lambda _, v: v['file'])): + subtitles.setdefault(sub.get('label', 'und'), []).append({'url': sub['file']}) + + formats = [] + for stream_url in traverse_obj(playback_data, ('sources', 'HLS', ..., 'file', {url_or_none})): + stream_url = re.sub(r'/playlist(?:_pd\d+)?\.m3u8', '/index.m3u8', stream_url) + formats.extend(self._extract_m3u8_formats(stream_url, video_id, fatal=False)) + + metadata = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vod/detail', video_id, + note='Downloading mobile details', fatal=False, query={'vodFileId': video_id}) + return { + 'id': video_id, + 'display_id': display_id, + **traverse_obj(metadata, ('vodDetail', { + 'title': 'vodTitleView', + 'series': 'programTitle', + 'age_limit': ('watchAge', {int_or_none}), + 'release_date': ('broadcastDate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'description': 'episodeContents', + 'thumbnail': ('imgFileUrl', {url_or_none}), + })), + 'duration': parse_duration(playback_data.get('playTime')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class JTBCProgramIE(InfoExtractor): + IE_NAME = 'JTBC:program' + _VALID_URL = r'https?://(?:vod\.jtbc\.co\.kr/program|tv\.jtbc\.co\.kr/replay)/(?Ppr\d+)/(?:replay|pm\d+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10010392/pm10032710', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10010392', + }, + 'playlist_count': 398, + }, { + 'url': 'https://vod.jtbc.co.kr/program/pr10011491/replay', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10011491', + }, + 'playlist_count': 59, + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + vod_list = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vodClip/programHome/programReplayVodList', program_id, + note='Downloading program replay list', query={ + 'programId': program_id, + 'rowCount': '10000', + }) + + entries = [self.url_result(f'https://vod.jtbc.co.kr/player/program/{video_id}', JTBCIE, video_id) + for video_id in traverse_obj(vod_list, ('programReplayVodList', ..., 'episodeId'))] + return self.playlist_result(entries, program_id) From 2acd1d555ef89851c73773776715d3de9a0e30b9 Mon Sep 17 00:00:00 2001 From: Riteo Date: Fri, 13 Oct 2023 22:01:39 +0200 Subject: [PATCH 028/108] [core] Ensure thumbnail output directory exists (#7985) Closes #8203 Authored by: Riteo --- yt_dlp/YoutubeDL.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f322b12a22bf..71d17ac01c88 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4221,7 +4221,7 @@ def _write_subtitles(self, info_dict, filename): return ret def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): - ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) ''' + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename); or None if error ''' write_all = self.params.get('write_all_thumbnails', False) thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): @@ -4237,6 +4237,9 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None self.write_debug(f'Skipping writing {label} thumbnail') return ret + if not self._ensure_dir_exists(filename): + return None + for idx, t in list(enumerate(thumbnails))[::-1]: thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') thumb_display_id = f'{label} thumbnail {t["id"]}' From b634ba742d8f38ce9ecfa0546485728b0c6c59d1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Oct 2023 17:15:35 -0500 Subject: [PATCH 029/108] [cleanup] Misc (#8338) Authored by: bashonly, gamer191 --- README.md | 3 +-- yt_dlp/extractor/banbye.py | 4 ++-- yt_dlp/extractor/breitbart.py | 2 +- yt_dlp/extractor/craftsy.py | 2 +- yt_dlp/extractor/cybrary.py | 4 ++-- yt_dlp/extractor/fifa.py | 2 +- yt_dlp/extractor/filmmodu.py | 2 +- yt_dlp/extractor/itprotv.py | 4 ++-- yt_dlp/extractor/jable.py | 4 ++-- yt_dlp/extractor/kommunetv.py | 2 +- yt_dlp/extractor/mainstreaming.py | 2 +- yt_dlp/extractor/mediaite.py | 2 +- yt_dlp/extractor/mocha.py | 2 +- yt_dlp/extractor/nfl.py | 4 ++-- yt_dlp/extractor/novaplay.py | 2 +- yt_dlp/extractor/nubilesporn.py | 2 +- yt_dlp/extractor/oftv.py | 4 ++-- yt_dlp/extractor/sina.py | 2 +- yt_dlp/extractor/twitter.py | 2 +- yt_dlp/extractor/utreon.py | 2 +- yt_dlp/extractor/vk.py | 4 ++-- yt_dlp/extractor/weverse.py | 12 ++++++------ yt_dlp/extractor/wimtv.py | 2 +- yt_dlp/extractor/xhamster.py | 4 ++-- yt_dlp/extractor/youtube.py | 2 +- yt_dlp/extractor/zoom.py | 2 +- yt_dlp/options.py | 2 +- 27 files changed, 40 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index a26482faaad3..dd4652d43aa5 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,6 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) - * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given * Channel URLs download all uploads of the channel, including shorts and live * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` @@ -913,7 +912,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git Defaults to ~/.netrc --netrc-cmd NETRC_CMD Command to execute to get the credentials for an extractor. - --video-password PASSWORD Video password (vimeo, youku) + --video-password PASSWORD Video-specific password --ap-mso MSO Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index e0fc93b973f2..dfcc82f02165 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id): class BanByeIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P[\w-]+)' _TESTS = [{ 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', @@ -120,7 +120,7 @@ def _real_extract(self, url): class BanByeChannelIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P\w+)' _TESTS = [{ 'url': 'https://banbye.com/channel/ch_wrealu24', 'info_dict': { diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index ea0a59c8668e..b5abb7f19471 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -2,7 +2,7 @@ class BreitBartIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?breitbart\.com/videos/v/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py index 307bfb94600e..5d3733143a7b 100644 --- a/yt_dlp/extractor/craftsy.py +++ b/yt_dlp/extractor/craftsy.py @@ -10,7 +10,7 @@ class CraftsyIE(InfoExtractor): - _VALID_URL = r'https?://www.craftsy.com/class/(?P[a-z0-9_-]+)/' + _VALID_URL = r'https?://www\.craftsy\.com/class/(?P[\w-]+)' _TESTS = [{ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', 'info_dict': { diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index 73f2439b3143..aeffe93b41a9 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -45,7 +45,7 @@ def _get_vimeo_id(self, activity_id): class CybraryIE(CybraryBaseIE): - _VALID_URL = r'https?://app.cybrary.it/immersive/(?P[0-9]+)/activity/(?P[0-9]+)' + _VALID_URL = r'https?://app\.cybrary\.it/immersive/(?P[0-9]+)/activity/(?P[0-9]+)' _TESTS = [{ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', 'md5': '9ae12d37e555cb2ed554223a71a701d0', @@ -110,7 +110,7 @@ def _real_extract(self, url): class CybraryCourseIE(CybraryBaseIE): - _VALID_URL = r'https://app.cybrary.it/browse/course/(?P[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https://app\.cybrary\.it/browse/course/(?P[\w-]+)/?(?:$|[#?])' _TESTS = [{ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', 'info_dict': { diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index 8b4db3a8ae01..f604cbd40de7 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -8,7 +8,7 @@ class FifaIE(InfoExtractor): - _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' + _VALID_URL = r'https?://www\.fifa\.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' _TESTS = [{ 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', 'info_dict': { diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py index 9eb550eed543..1e793560d46e 100644 --- a/yt_dlp/extractor/filmmodu.py +++ b/yt_dlp/extractor/filmmodu.py @@ -3,7 +3,7 @@ class FilmmoduIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' + _VALID_URL = r'https?://(?:www\.)?filmmodu\.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' _TESTS = [{ 'url': 'https://www.filmmodu.org/f9-altyazili-izle', 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py index 4ac12603ae8f..b9d5c196d065 100644 --- a/yt_dlp/extractor/itprotv.py +++ b/yt_dlp/extractor/itprotv.py @@ -31,7 +31,7 @@ def _check_if_logged_in(self, webpage): class ITProTVIE(ITProTVBaseIE): - _VALID_URL = r'https://app.itpro.tv/course/(?P[\w-]+)/(?P[\w-]+)' + _VALID_URL = r'https://app\.itpro\.tv/course/(?P[\w-]+)/(?P[\w-]+)' _TESTS = [{ 'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv', 'md5': 'bca4a28c2667fd1a63052e71a94bb88c', @@ -102,7 +102,7 @@ def _real_extract(self, url): class ITProTVCourseIE(ITProTVBaseIE): - _VALID_URL = r'https?://app.itpro.tv/course/(?P[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https?://app\.itpro\.tv/course/(?P[\w-]+)/?(?:$|[#?])' _TESTS = [ { 'url': 'https://app.itpro.tv/course/guided-tour', diff --git a/yt_dlp/extractor/jable.py b/yt_dlp/extractor/jable.py index 84c3225e4858..71fed49ea08c 100644 --- a/yt_dlp/extractor/jable.py +++ b/yt_dlp/extractor/jable.py @@ -10,7 +10,7 @@ class JableIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?jable\.tv/videos/(?P[\w-]+)' _TESTS = [{ 'url': 'https://jable.tv/videos/pppd-812/', 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', @@ -64,7 +64,7 @@ def _real_extract(self, url): class JablePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?jable\.tv/(?:categories|models|tags)/(?P[\w-]+)' _TESTS = [{ 'url': 'https://jable.tv/models/kaede-karen/', 'info_dict': { diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py index e21e556be346..a30905b579ca 100644 --- a/yt_dlp/extractor/kommunetv.py +++ b/yt_dlp/extractor/kommunetv.py @@ -3,7 +3,7 @@ class KommunetvIE(InfoExtractor): - _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _VALID_URL = r'https://\w+\.kommunetv\.no/archive/(?P\w+)' _TEST = { 'url': 'https://oslo.kommunetv.no/archive/921', 'md5': '5f102be308ee759be1e12b63d5da4bbc', diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py index fe5589d598d8..fd9bba8bcb98 100644 --- a/yt_dlp/extractor/mainstreaming.py +++ b/yt_dlp/extractor/mainstreaming.py @@ -13,7 +13,7 @@ class MainStreamingIE(InfoExtractor): - _VALID_URL = r'https?://(?:webtools-?)?(?P[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P\w+)' + _VALID_URL = r'https?://(?:webtools-?)?(?P[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P\w+)' _EMBED_REGEX = [rf']+?src=["\']?(?P{_VALID_URL})["\']?'] IE_DESC = 'MainStreaming Player' diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py index ab253920b671..32887cbdef04 100644 --- a/yt_dlp/extractor/mediaite.py +++ b/yt_dlp/extractor/mediaite.py @@ -2,7 +2,7 @@ class MediaiteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _VALID_URL = r'https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}' _TESTS = [{ 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', 'info_dict': { diff --git a/yt_dlp/extractor/mocha.py b/yt_dlp/extractor/mocha.py index 5f72b810bb5e..2fbc0e9110b3 100644 --- a/yt_dlp/extractor/mocha.py +++ b/yt_dlp/extractor/mocha.py @@ -3,7 +3,7 @@ class MochaVideoIE(InfoExtractor): - _VALID_URL = r'https?://video.mocha.com.vn/(?P[\w-]+)' + _VALID_URL = r'https?://video\.mocha\.com\.vn/(?P[\w-]+)' _TESTS = [{ 'url': 'http://video.mocha.com.vn/chuyen-meo-gia-su-tu-thong-diep-cuoc-song-v18694039', 'info_dict': { diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index bd060dba9db7..3f83cd20ef7f 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -247,7 +247,7 @@ def _real_extract(self, url): class NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/(?P[\w-]+)(?:/(?P\d+))?' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/games/(?P[\w-]+)(?:/(?P\d+))?' _TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -342,7 +342,7 @@ def entries(): class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/episodes/(?P[\w-]+)' _TESTS = [{ 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index 92d1d136c7c9..d8849cd88de3 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -3,7 +3,7 @@ class NovaPlayIE(InfoExtractor): - _VALID_URL = r'https://play.nova\.bg/video/.*/(?P\d+)' + _VALID_URL = r'https://play\.nova\.bg/video/[^?#]+/(?P\d+)' _TESTS = [ { 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat/606627', diff --git a/yt_dlp/extractor/nubilesporn.py b/yt_dlp/extractor/nubilesporn.py index d4f1d9d67a8c..1d630f547dd9 100644 --- a/yt_dlp/extractor/nubilesporn.py +++ b/yt_dlp/extractor/nubilesporn.py @@ -19,7 +19,7 @@ class NubilesPornIE(InfoExtractor): _NETRC_MACHINE = 'nubiles-porn' _VALID_URL = r'''(?x) - https://members.nubiles-porn.com/video/watch/(?P\d+) + https://members\.nubiles-porn\.com/video/watch/(?P\d+) (?:/(?P[\w\-]+-s(?P\d+)e(?P\d+)))? ''' diff --git a/yt_dlp/extractor/oftv.py b/yt_dlp/extractor/oftv.py index 3ae7278fb917..4cac5184639f 100644 --- a/yt_dlp/extractor/oftv.py +++ b/yt_dlp/extractor/oftv.py @@ -4,7 +4,7 @@ class OfTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/video/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?of\.tv/video/(?P\w+)' _TESTS = [{ 'url': 'https://of.tv/video/627d7d95b353db0001dadd1a', 'md5': 'cb9cd5db3bb9ee0d32bfd7e373d6ef0a', @@ -34,7 +34,7 @@ def _real_extract(self, url): class OfTVPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/creators/(?P[a-zA-Z0-9-]+)/.?' + _VALID_URL = r'https?://(?:www\.)?of\.tv/creators/(?P[a-zA-Z0-9-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://of.tv/creators/this-is-fire/', 'playlist_count': 8, diff --git a/yt_dlp/extractor/sina.py b/yt_dlp/extractor/sina.py index 98428118885b..eeb9ebb44c9f 100644 --- a/yt_dlp/extractor/sina.py +++ b/yt_dlp/extractor/sina.py @@ -11,7 +11,7 @@ class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + _VALID_URL = r'''(?x)https?://(?:[^/?#]+\.)?video\.sina\.com\.cn/ (?: (?:view/|.*\#)(?P\d+)| .+?/(?P[^/?#]+)(?:\.s?html)| diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4065acbaaaa3..b6386214d9c7 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1741,7 +1741,7 @@ def _real_extract(self, url): class TwitterShortenerIE(TwitterBaseIE): IE_NAME = 'twitter:shortener' - _VALID_URL = r'https?://t.co/(?P[^?]+)|tco:(?P[^?]+)' + _VALID_URL = r'https?://t\.co/(?P[^?#]+)|tco:(?P[^?#]+)' _BASE_URL = 'https://t.co/' def _real_extract(self, url): diff --git a/yt_dlp/extractor/utreon.py b/yt_dlp/extractor/utreon.py index 90c10c051a79..8a91691019e3 100644 --- a/yt_dlp/extractor/utreon.py +++ b/yt_dlp/extractor/utreon.py @@ -10,7 +10,7 @@ class UtreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?utreon.com/v/(?P[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P[\w-]+)' _TESTS = [{ 'url': 'https://utreon.com/v/z_I7ikQbuDw', 'info_dict': { diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 915422817a55..c12e873623fc 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -97,12 +97,12 @@ class VKIE(VKBaseIE): (?: (?: (?:(?:m|new)\.)?vk\.com/video_| - (?:www\.)?daxab.com/ + (?:www\.)?daxab\.com/ ) ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)| - (?:www\.)?daxab.com/embed/ + (?:www\.)?daxab\.com/embed/ ) (?P-?\d+_\d+)(?:.*\blist=(?P([\da-f]+)|(ln-[\da-zA-Z]+)))? ) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index bbf62856a6c3..47f36806bf03 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -182,7 +182,7 @@ def _extract_live_status(self, data): class WeverseIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/live/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/live/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/live/0-107323480', 'md5': '1fa849f00181eef9100d3c8254c47979', @@ -344,7 +344,7 @@ def _real_extract(self, url): class WeverseMediaIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/media/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/media/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/4-116372884', 'md5': '8efc9cfd61b2f25209eb1a5326314d28', @@ -420,7 +420,7 @@ def _real_extract(self, url): class WeverseMomentIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/moment/(?P[\da-f]+)/post/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/moment/(?P[\da-f]+)/post/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444', 'md5': '87733ac19a54081b7dfc2442036d282b', @@ -516,7 +516,7 @@ def _real_extract(self, url): class WeverseLiveTabIE(WeverseTabBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/live/?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/live/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/billlie/live/', 'playlist_mincount': 55, @@ -534,7 +534,7 @@ class WeverseLiveTabIE(WeverseTabBaseIE): class WeverseMediaTabIE(WeverseTabBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/', 'playlist_mincount': 231, @@ -558,7 +558,7 @@ class WeverseMediaTabIE(WeverseTabBaseIE): class WeverseLiveIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/purplekiss', 'info_dict': { diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py index 5711123903e0..f9bf092df5de 100644 --- a/yt_dlp/extractor/wimtv.py +++ b/yt_dlp/extractor/wimtv.py @@ -11,7 +11,7 @@ class WimTVIE(InfoExtractor): _player = None _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _VALID_URL = r'''(?x: - https?://platform.wim.tv/ + https?://platform\.wim\.tv/ (?: (?:embed/)?\? |\#/webtv/.+?/ diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index aec1f20bb813..01ac5ddb65ed 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -24,7 +24,7 @@ class XHamsterIE(InfoExtractor): _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' _VALID_URL = r'''(?x) https?:// - (?:.+?\.)?%s/ + (?:[^/?#]+\.)?%s/ (?: movies/(?P[\dA-Za-z]+)/(?P[^/]*)\.html| videos/(?P[^/]*)-(?P[\dA-Za-z]+) @@ -372,7 +372,7 @@ def get_height(s): class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P\d+)' % XHamsterIE._DOMAINS + _VALID_URL = r'https?://(?:[^/?#]+\.)?%s/xembed\.php\?video=(?P\d+)' % XHamsterIE._DOMAINS _EMBED_REGEX = [r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1'] _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c5be366362ee..ac28ed7d282e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -949,7 +949,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers main_rm = next(main_retries) # Manual retry loop for multiple RetryManagers # The proper RetryManager MUST be advanced after an error - # and it's result MUST be checked if the manager is non fatal + # and its result MUST be checked if the manager is non fatal while True: try: response = self._call_api( diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index 1e41d04349f5..329ba1415ee3 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -13,7 +13,7 @@ class ZoomIE(InfoExtractor): IE_NAME = 'zoom' - _VALID_URL = r'(?Phttps?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?Pplay|share)/(?P[A-Za-z0-9_.-]+)' + _VALID_URL = r'(?Phttps?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?Pplay|share)/(?P[\w.-]+)' _TESTS = [{ 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 163809706a9d..85a6402a6d60 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -727,7 +727,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, youku)') + help='Video-specific password') authentication.add_option( '--ap-mso', dest='ap_mso', metavar='MSO', From b73c4093187cffddcb6fbc4bfbdc0fea244ff1e9 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 13 Oct 2023 22:22:31 +0000 Subject: [PATCH 030/108] Release 2023.10.13 Created by: bashonly :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +++---- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +++---- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 +++---- CONTRIBUTORS | 4 ++++ Changelog.md | 24 +++++++++++++++++++ supportedsites.md | 4 ++++ yt_dlp/version.py | 4 ++-- 10 files changed, 58 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index dacb41758d43..6c713e5a8348 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ec6e298a1921..e20036ce8dc1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index cf3cdd21f317..a9845b6b8382 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 1bbcf6895663..d3d60a11e5d9 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index d3bc06e80917..57de148d0454 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 30311d5b5694..7b55a7427b31 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 8eda41307264..3035ee2961d9 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -509,3 +509,7 @@ handlerug jiru madewokherd xofe +awalgarg +midnightveil +naginatana +Riteo diff --git a/Changelog.md b/Changelog.md index 48dcbf10295c..6f45eab2f243 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,30 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.10.13 + +#### Core changes +- [Ensure thumbnail output directory exists](https://github.com/yt-dlp/yt-dlp/commit/2acd1d555ef89851c73773776715d3de9a0e30b9) ([#7985](https://github.com/yt-dlp/yt-dlp/issues/7985)) by [Riteo](https://github.com/Riteo) +- **utils** + - `js_to_json`: [Fix `Date` constructor parsing](https://github.com/yt-dlp/yt-dlp/commit/9d7ded6419089c1bf252496073f73ad90ed71004) ([#8295](https://github.com/yt-dlp/yt-dlp/issues/8295)) by [awalgarg](https://github.com/awalgarg), [Grub4K](https://github.com/Grub4K) + - `write_xattr`: [Use `os.setxattr` if available](https://github.com/yt-dlp/yt-dlp/commit/84e26038d4002e763ea51ca1bdce4f7e63c540bf) ([#8205](https://github.com/yt-dlp/yt-dlp/issues/8205)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **artetv**: [Support age-restricted content](https://github.com/yt-dlp/yt-dlp/commit/09f815ad52843219a7ee3f2a0dddf6c250c91f0c) ([#8301](https://github.com/yt-dlp/yt-dlp/issues/8301)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **jtbc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b286ec68f1f28798b3e371f888a2ed97d399cf77) ([#8314](https://github.com/yt-dlp/yt-dlp/issues/8314)) by [seproDev](https://github.com/seproDev) +- **mbn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305) ([#8312](https://github.com/yt-dlp/yt-dlp/issues/8312)) by [seproDev](https://github.com/seproDev) +- **nhk**: [Fix Japanese-language VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/4de94b9e165bfd6421a692f5f2eabcdb08edcb71) ([#8309](https://github.com/yt-dlp/yt-dlp/issues/8309)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Fix bug with `downloader_options`](https://github.com/yt-dlp/yt-dlp/commit/b9316642313bbc9e209ac0d2276d37ba60bceb49) by [bashonly](https://github.com/bashonly) +- **tenplay**: [Add support for seasons](https://github.com/yt-dlp/yt-dlp/commit/88a99c87b680ae59002534a517e191f46c42cbd4) ([#7939](https://github.com/yt-dlp/yt-dlp/issues/7939)) by [midnightveil](https://github.com/midnightveil) +- **youku**: [Improve tudou.com support](https://github.com/yt-dlp/yt-dlp/commit/b7098d46b552a9322c6cea39ba80be5229f922de) ([#8160](https://github.com/yt-dlp/yt-dlp/issues/8160)) by [naginatana](https://github.com/naginatana) +- **youtube**: [Fix bug with `--extractor-retries inf`](https://github.com/yt-dlp/yt-dlp/commit/feebf6d02fc9651331eee2af5e08e6112288163b) ([#8328](https://github.com/yt-dlp/yt-dlp/issues/8328)) by [Grub4K](https://github.com/Grub4K) + +#### Downloader changes +- **fragment**: [Improve progress calculation](https://github.com/yt-dlp/yt-dlp/commit/1c51c520f7b511ebd9e4eb7322285a8c31eedbbd) ([#8241](https://github.com/yt-dlp/yt-dlp/issues/8241)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [b634ba7](https://github.com/yt-dlp/yt-dlp/commit/b634ba742d8f38ce9ecfa0546485728b0c6c59d1) by [bashonly](https://github.com/bashonly), [gamer191](https://github.com/gamer191) + ### 2023.10.07 #### Extractor changes diff --git a/supportedsites.md b/supportedsites.md index ecef4dc2d1e0..0ab61d68d05e 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -657,6 +657,8 @@ - **Joj** - **Jove** - **JStream** + - **JTBC**: jtbc.co.kr + - **JTBC:program** - **JWPlatform** - **Kakao** - **Kaltura** @@ -766,6 +768,7 @@ - **massengeschmack.tv** - **Masters** - **MatchTV** + - **MBN**: mbn.co.kr (매일방송) - **MDR**: MDR.DE and KiKA - **MedalTV** - **media.ccc.de** @@ -1468,6 +1471,7 @@ - **Tempo** - **TennisTV**: [*tennistv*](## "netrc machine") - **TenPlay**: [*10play*](## "netrc machine") + - **TenPlaySeason** - **TF1** - **TFO** - **TheHoleTv** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 60c1c94cc35f..9d00963162f3 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.10.07' +__version__ = '2023.10.13' -RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991' +RELEASE_GIT_HEAD = 'b634ba742d8f38ce9ecfa0546485728b0c6c59d1' VARIANT = None From 700444c23ddb65f618c2abd942acdc0c58c650b1 Mon Sep 17 00:00:00 2001 From: bashonly Date: Fri, 13 Oct 2023 18:02:06 -0500 Subject: [PATCH 031/108] [ci] Run core tests with dependencies Authored by: bashonly, coletdjnz --- .github/workflows/core.yml | 2 +- devscripts/make_changelog.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7fcf11dfa2a2..7acaee1e83ee 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -33,7 +33,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install pytest - run: pip install pytest + run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False run: | diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 9ff65db1467d..d0e893e58112 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -56,6 +56,7 @@ def subgroup_lookup(cls): }, cls.MISC: { 'build', + 'ci', 'cleanup', 'devscripts', 'docs', From 8a8b54523addf46dfd50ef599761a81bc22362e6 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 14 Oct 2023 12:33:00 +1300 Subject: [PATCH 032/108] [rh:requests] Add handler for `requests` HTTP library (#3668) Adds support for HTTPS proxies and persistent connections (keep-alive) Closes https://github.com/yt-dlp/yt-dlp/issues/1890 Resolves https://github.com/yt-dlp/yt-dlp/issues/4070 Resolves https://github.com/ytdl-org/youtube-dl/issues/32549 Resolves https://github.com/ytdl-org/youtube-dl/issues/14523 Resolves https://github.com/ytdl-org/youtube-dl/issues/13734 Authored by: coletdjnz, Grub4K, bashonly --- .github/workflows/core.yml | 2 +- README.md | 4 +- requirements.txt | 2 + setup.py | 9 +- test/test_networking.py | 168 +++++++++--- test/test_socks.py | 36 +-- yt_dlp/YoutubeDL.py | 7 +- yt_dlp/__pyinstaller/hook-yt_dlp.py | 4 +- yt_dlp/dependencies/__init__.py | 9 + yt_dlp/networking/__init__.py | 10 + yt_dlp/networking/_helper.py | 20 +- yt_dlp/networking/_requests.py | 398 ++++++++++++++++++++++++++++ yt_dlp/networking/_urllib.py | 26 +- yt_dlp/options.py | 3 +- 14 files changed, 619 insertions(+), 79 deletions(-) create mode 100644 yt_dlp/networking/_requests.py diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7acaee1e83ee..049faf3738d4 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -32,7 +32,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest + - name: Install dependencies run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False diff --git a/README.md b/README.md index dd4652d43aa5..3b7432474d64 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is * yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. For ease of use, a few more compat options are available: @@ -164,7 +165,7 @@ For ease of use, a few more compat options are available: * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler`. Use this to enable all future compat options # INSTALLATION @@ -274,6 +275,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT [1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) * [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**requests**](https://github.com/psf/requests)\* - HTTP library. For HTTPS proxy and persistent connections support. Licensed under [Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) ### Metadata diff --git a/requirements.txt b/requirements.txt index dde37120f7c7..112c30aeb7ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ websockets brotli; platform_python_implementation=='CPython' brotlicffi; platform_python_implementation!='CPython' certifi +requests>=2.31.0,<3 +urllib3>=1.26.17,<3 \ No newline at end of file diff --git a/setup.py b/setup.py index a2f9f55c365c..1740db27d839 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,14 @@ def py2exe_params(): 'compressed': 1, 'optimize': 2, 'dist_dir': './dist', - 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # py2exe appears to confuse this with our socks library. + # We don't use pysocks and urllib3.contrib.socks would fail to import if tried. + 'urllib3.contrib.socks' + ], 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', diff --git a/test/test_networking.py b/test/test_networking.py index 5308c8d6faf4..2b45deac79be 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -28,7 +28,7 @@ from test.helper import FakeYDL, http_server_port from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli +from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -43,6 +43,7 @@ HTTPError, IncompleteRead, NoSupportingHandlers, + ProxyError, RequestError, SSLError, TransportError, @@ -305,7 +306,7 @@ def setup_class(cls): class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -316,7 +317,7 @@ def test_verify_cert(self, handler): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? @@ -334,7 +335,7 @@ def test_ssl_error(self, handler): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -346,7 +347,7 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, @@ -361,14 +362,14 @@ def test_remove_dot_segments(self, handler): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -378,7 +379,7 @@ def test_raise_http_error(self, handler): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -389,7 +390,7 @@ def test_response_url(self, handler): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect(self, handler): with handler() as rh: def do_req(redirect_status, method, assert_no_content=False): @@ -444,7 +445,7 @@ def do_req(redirect_status, method, assert_no_content=False): with pytest.raises(HTTPError): do_req(code, 'GET') - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. with handler() as rh: @@ -476,19 +477,19 @@ def test_request_cookie_header(self, handler): assert b'Cookie: test=ytdlp' not in data assert b'Cookie: test=test' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -505,7 +506,7 @@ def test_cookies(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'Cookie: test=ytdlp' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -521,7 +522,7 @@ def test_headers(self, handler): assert b'Test2: test2' not in data assert b'Test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -537,7 +538,7 @@ def test_timeout(self, handler): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' with handler(source_address=source_address) as rh: @@ -545,13 +546,13 @@ def test_source_address(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '