From 9f458377531209793ba92d4666994b36f906bfd7 Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Tue, 22 Sep 2020 03:50:11 +0900 Subject: [PATCH 1/8] [tver] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tver.py | 133 +++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 youtube_dl/extractor/tver.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ae7079a6a4d..f65f0189ce9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1198,6 +1198,7 @@ TVANouvellesIE, TVANouvellesArticleIE, ) +from .tver import TVerIE from .tvc import ( TVCIE, TVCArticleIE, diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py new file mode 100644 index 00000000000..eefd6f1af78 --- /dev/null +++ b/youtube_dl/extractor/tver.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import csv +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..compat import ( + compat_HTTPError, +) +from ..utils import ( + ExtractorError, + js_to_json, + unsmuggle_url, +) + + +class TVerIE(BrightcoveNewIE): + + _TESTS = [ + { + 'url': 'https://tver.jp/feature/f0057485', # 'feature' + 'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0057485', # TVer ID + 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID + 'ext': 'mp4', + 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', + 'description': '大和田(香川照之)の不正を糾弾し、子会社へ出向を命じられた半沢直樹(堺雅人)は、東京セントラル証券営業企画部長に。ある日1500億円超の買収案件が舞い込むが…。', + 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4031511847001/37b5f176-3989-48d9-81d1-4688e80c5531/main/1920x1080/34m10s16ms/match/image.jpg', + 'duration': 4100.032, + 'timestamp': 1600308623, + 'upload_date': '20200917', + 'uploader_id': '4031511847001', + }, + 
'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + }, + { + 'url': 'https://tver.jp/corner/f0056997', # 'corner' + 'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0056997', # TVer ID + 'display_id': 'ref:kanokari_10', # Brightcove ID + 'ext': 'mp4', + 'title': '彼女、お借りします 第10話「友達の彼女」-トモカノ-', + 'description': 'バイトの初任給を何に使おうか考える和也だったが、ふと栗林のことが脳裏をよぎる。最近栗林の様子がおかしいと、木部から話を聞いていたのだ。ボーッとしていたり、女性不信のつぶやきをしているという。和也は意を決して、栗林を呼び出すことに。翌日、栗林が和也を待っていると──「駿君、だよね?」。待ち合わせ場所にやって来たのは、千鶴だった……!', + 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/5102072605001/900216cc-2e97-4c19-93bb-1a531de358d6/main/1920x1080/12m18s37ms/match/image.jpg', + 'duration': 1476.075, + 'timestamp': 1599554409, + 'upload_date': '20200908', + 'uploader_id': '5102072605001', + }, + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + }, + { + 'url': 'https://tver.jp/episode/76799350', # 'episode' + 'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': '76799350', # TVer ID + 'display_id': '2366_2365_4533', # Brightcove ID + 'ext': 'mp4', + 'title': '港時間 #49 神奈川県/リビエラシーボニアマリーナ 9月18日(金)放送分', + 'description': '【毎週金曜 よる12時15分から放送】\n\n日本のヨット文化 を育んできた三浦半島の西海岸、小網代湾にあるリビエラシーボニアマリーナ。昨年から始まったSailGPの日本チームを率いるヨット界のレジェンドに会いました。', + 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4394098883001/904361ca-40d3-4028-8478-8916b9a0ff49/main/1920x1080/58s80ms/match/image.jpg', + 'duration': 116.16, + 'timestamp': 1600052421, + 'upload_date': '20200914', + 'uploader_id': '4394098883001', + }, + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + }, + ] + + IE_NAME = 'TVer' + IE_DESC = 'TVer' + + _VALID_URL = 
r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?Pf?[0-9]+)' + _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + + # extract video id + video_id = self._match_id(url) + + # download webpage + webpage = self._download_webpage(url, video_id) + + # extract video information + video_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'video information', flags=re.DOTALL).strip() + video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' + video_info = video_info_csv.split(',') + + # extract brightcove account id + brightcove_account_id = video_info[3] + + # extract brightcove video id + brightcove_video_id = video_info[4] + + # brightcove url + brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) + + # debug output + if self._downloader.params.get('verbose', False): + self.to_screen('Video Information: %s' % video_info) + self.to_screen('Brightcove Account ID: %s' % brightcove_account_id) + self.to_screen('Brightcove Video ID: %s' % brightcove_video_id) + self.to_screen('Brightcove URL: %s' % brightcove_url) + + # evacuate _VALID_URL + _VALID_URL = self._VALID_URL + + # temporarily replace _VALID_URL + # prevent _VALID_URL from being the URL of Tver when executing the parent class's _real_extract () method + self._VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*(?Pvideo|playlist)Id=(?P\d+|ref:[^&]+)' + + # get video information + info_dict = super(TVerIE, self)._real_extract(brightcove_url) + + # undo _VALID_URL + self._VALID_URL = _VALID_URL + + # replacement + info_dict['id'] = video_id # TVer ID + info_dict['display_id'] = brightcove_video_id # Brightcove ID + info_dict['thumbnail'] = info_dict['thumbnail'].replace('160x90', '1920x1080') # select large 
thumbnail + info_dict['description'] = self._html_search_regex( # desctiption + r'
(?P.*?)
', + webpage, 'description', flags=re.DOTALL) + + return info_dict From e57602b380b07f4c7fe2a5249e25d350b8ed59ac Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Tue, 22 Sep 2020 04:17:54 +0900 Subject: [PATCH 2/8] [tver] fix the code pointed out in flake8 --- youtube_dl/extractor/tver.py | 69 +++++++++++++++++------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index eefd6f1af78..4132be9185a 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -1,30 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import csv import re -from .common import InfoExtractor from .brightcove import BrightcoveNewIE -from ..compat import ( - compat_HTTPError, -) -from ..utils import ( - ExtractorError, - js_to_json, - unsmuggle_url, -) class TVerIE(BrightcoveNewIE): - + _TESTS = [ { - 'url': 'https://tver.jp/feature/f0057485', # 'feature' - 'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'url': 'https://tver.jp/feature/f0057485', # 'feature' + 'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option 'info_dict': { - 'id': 'f0057485', # TVer ID - 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID + 'id': 'f0057485', # TVer ID + 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID 'ext': 'mp4', 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', 'description': '大和田(香川照之)の不正を糾弾し、子会社へ出向を命じられた半沢直樹(堺雅人)は、東京セントラル証券営業企画部長に。ある日1500億円超の買収案件が舞い込むが…。', @@ -37,11 +27,11 @@ class TVerIE(BrightcoveNewIE): 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', }, { - 'url': 'https://tver.jp/corner/f0056997', # 'corner' - 'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 
'url': 'https://tver.jp/corner/f0056997', # 'corner' + 'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option 'info_dict': { - 'id': 'f0056997', # TVer ID - 'display_id': 'ref:kanokari_10', # Brightcove ID + 'id': 'f0056997', # TVer ID + 'display_id': 'ref:kanokari_10', # Brightcove ID 'ext': 'mp4', 'title': '彼女、お借りします 第10話「友達の彼女」-トモカノ-', 'description': 'バイトの初任給を何に使おうか考える和也だったが、ふと栗林のことが脳裏をよぎる。最近栗林の様子がおかしいと、木部から話を聞いていたのだ。ボーッとしていたり、女性不信のつぶやきをしているという。和也は意を決して、栗林を呼び出すことに。翌日、栗林が和也を待っていると──「駿君、だよね?」。待ち合わせ場所にやって来たのは、千鶴だった……!', @@ -54,11 +44,11 @@ class TVerIE(BrightcoveNewIE): 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', }, { - 'url': 'https://tver.jp/episode/76799350', # 'episode' - 'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'url': 'https://tver.jp/episode/76799350', # 'episode' + 'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option 'info_dict': { - 'id': '76799350', # TVer ID - 'display_id': '2366_2365_4533', # Brightcove ID + 'id': '76799350', # TVer ID + 'display_id': '2366_2365_4533', # Brightcove ID 'ext': 'mp4', 'title': '港時間 #49 神奈川県/リビエラシーボニアマリーナ 9月18日(金)放送分', 'description': '【毎週金曜 よる12時15分から放送】\n\n日本のヨット文化 を育んできた三浦半島の西海岸、小網代湾にあるリビエラシーボニアマリーナ。昨年から始まったSailGPの日本チームを率いるヨット界のレジェンドに会いました。', @@ -71,13 +61,13 @@ class TVerIE(BrightcoveNewIE): 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', }, ] - + IE_NAME = 'TVer' IE_DESC = 'TVer' _VALID_URL = r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?Pf?[0-9]+)' - _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only - + _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only + BRIGHTCOVE_URL_TEMPLATE = 
'http://players.brightcove.net/%s/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): @@ -90,7 +80,7 @@ def _real_extract(self, url): # extract video information video_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'video information', flags=re.DOTALL).strip() - video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' + video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' video_info = video_info_csv.split(',') # extract brightcove account id @@ -100,7 +90,7 @@ def _real_extract(self, url): brightcove_video_id = video_info[4] # brightcove url - brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) + brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) # debug output if self._downloader.params.get('verbose', False): @@ -119,15 +109,22 @@ def _real_extract(self, url): # get video information info_dict = super(TVerIE, self)._real_extract(brightcove_url) + # get video description + description = \ + self._og_search_description(webpage) or \ + self._html_search_meta('twitter:description', webpage) or \ + self._html_search_regex(r'
(?P.*?)
', webpage, 'description', default=None, flags=re.DOTALL) + # undo _VALID_URL self._VALID_URL = _VALID_URL - # replacement - info_dict['id'] = video_id # TVer ID - info_dict['display_id'] = brightcove_video_id # Brightcove ID - info_dict['thumbnail'] = info_dict['thumbnail'].replace('160x90', '1920x1080') # select large thumbnail - info_dict['description'] = self._html_search_regex( # desctiption - r'
(?P.*?)
', - webpage, 'description', flags=re.DOTALL) - + # TVer ID + info_dict['id'] = video_id + # Brightcove ID + info_dict['display_id'] = brightcove_video_id + # select large thumbnail + info_dict['thumbnail'] = info_dict['thumbnail'].replace('160x90', '1920x1080') + # desctiption + info_dict['description'] = description + return info_dict From a109b5fe110ca7817d92e587ef1262fbce57a295 Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Tue, 22 Sep 2020 04:34:12 +0900 Subject: [PATCH 3/8] [tver] fix to more verbose code --- youtube_dl/extractor/tver.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 4132be9185a..6f51ead8563 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -111,9 +111,8 @@ def _real_extract(self, url): # get video description description = \ - self._og_search_description(webpage) or \ - self._html_search_meta('twitter:description', webpage) or \ - self._html_search_regex(r'
(?P.*?)
', webpage, 'description', default=None, flags=re.DOTALL) + self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) or \ + self._html_search_regex(r']+class="description"[^>]*>(?P.*?)', webpage, 'description', default=None, flags=re.DOTALL) # undo _VALID_URL self._VALID_URL = _VALID_URL @@ -123,7 +122,7 @@ def _real_extract(self, url): # Brightcove ID info_dict['display_id'] = brightcove_video_id # select large thumbnail - info_dict['thumbnail'] = info_dict['thumbnail'].replace('160x90', '1920x1080') + info_dict['thumbnail'] = info_dict.get('thumbnail').replace('160x90', '1920x1080') # desctiption info_dict['description'] = description From bf6f1048b2662dd316321fa54fc9af319bc45e3d Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Sun, 27 Sep 2020 18:30:01 +0900 Subject: [PATCH 4/8] [tver] Significantly improved extraction logic and added 'creator' and 'uploader' keys to extraction information --- youtube_dl/extractor/tver.py | 186 +++++++++++++++++------------------ 1 file changed, 88 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 6f51ead8563..52b714b8b07 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -4,63 +4,31 @@ import re from .brightcove import BrightcoveNewIE - - -class TVerIE(BrightcoveNewIE): - - _TESTS = [ - { - 'url': 'https://tver.jp/feature/f0057485', # 'feature' - 'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': 'f0057485', # TVer ID - 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID - 'ext': 'mp4', - 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', - 'description': '大和田(香川照之)の不正を糾弾し、子会社へ出向を命じられた半沢直樹(堺雅人)は、東京セントラル証券営業企画部長に。ある日1500億円超の買収案件が舞い込むが…。', - 'thumbnail': 
'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4031511847001/37b5f176-3989-48d9-81d1-4688e80c5531/main/1920x1080/34m10s16ms/match/image.jpg', - 'duration': 4100.032, - 'timestamp': 1600308623, - 'upload_date': '20200917', - 'uploader_id': '4031511847001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', - }, - { - 'url': 'https://tver.jp/corner/f0056997', # 'corner' - 'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': 'f0056997', # TVer ID - 'display_id': 'ref:kanokari_10', # Brightcove ID - 'ext': 'mp4', - 'title': '彼女、お借りします 第10話「友達の彼女」-トモカノ-', - 'description': 'バイトの初任給を何に使おうか考える和也だったが、ふと栗林のことが脳裏をよぎる。最近栗林の様子がおかしいと、木部から話を聞いていたのだ。ボーッとしていたり、女性不信のつぶやきをしているという。和也は意を決して、栗林を呼び出すことに。翌日、栗林が和也を待っていると──「駿君、だよね?」。待ち合わせ場所にやって来たのは、千鶴だった……!', - 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/5102072605001/900216cc-2e97-4c19-93bb-1a531de358d6/main/1920x1080/12m18s37ms/match/image.jpg', - 'duration': 1476.075, - 'timestamp': 1599554409, - 'upload_date': '20200908', - 'uploader_id': '5102072605001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', +from .common import InfoExtractor +from ..utils import ( + js_to_json, +) + + +class TVerIE(InfoExtractor): + + _TEST = { + 'url': 'https://tver.jp/feature/f0057485', # In addition to 'feature', there are also categories such as 'corner' and 'episode'. 
+ 'md5': '4ae1bc00e6d55af8f7e2b2c17029f1a3', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0057485', # TVer ID + 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID + 'ext': 'mp4', + 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', + 'description': 'md5:92ce839312ee1e9b162de73fa08b6374', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 4100.032, + 'timestamp': 1600308623, + 'upload_date': '20200917', + 'uploader_id': '4031511847001', }, - { - 'url': 'https://tver.jp/episode/76799350', # 'episode' - 'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': '76799350', # TVer ID - 'display_id': '2366_2365_4533', # Brightcove ID - 'ext': 'mp4', - 'title': '港時間 #49 神奈川県/リビエラシーボニアマリーナ 9月18日(金)放送分', - 'description': '【毎週金曜 よる12時15分から放送】\n\n日本のヨット文化 を育んできた三浦半島の西海岸、小網代湾にあるリビエラシーボニアマリーナ。昨年から始まったSailGPの日本チームを率いるヨット界のレジェンドに会いました。', - 'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4394098883001/904361ca-40d3-4028-8478-8916b9a0ff49/main/1920x1080/58s80ms/match/image.jpg', - 'duration': 116.16, - 'timestamp': 1600052421, - 'upload_date': '20200914', - 'uploader_id': '4394098883001', - }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', - }, - ] + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + } IE_NAME = 'TVer' IE_DESC = 'TVer' @@ -68,62 +36,84 @@ class TVerIE(BrightcoveNewIE): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?Pf?[0-9]+)' _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=ref:%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + # TODO: FOD対応 def _real_extract(self, url): - # 
extract video id video_id = self._match_id(url) - - # download webpage webpage = self._download_webpage(url, video_id) - # extract video information - video_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'video information', flags=re.DOTALL).strip() - video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' - video_info = video_info_csv.split(',') + # extract tver information + tver_info_csv = self._search_regex(r'addPlayer\((?P.*?)\);', webpage, 'tver information', flags=re.DOTALL).strip() + tver_info_csv = tver_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' + tver_info = tver_info_csv.split(',') - # extract brightcove account id - brightcove_account_id = video_info[3] + # extract brightcove information + brightcove_account_id = tver_info[3] + brightcove_video_id = 'ref:' + tver_info[4] + brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) + brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') - # extract brightcove video id - brightcove_video_id = video_info[4] + # extract tver description + description = \ + self._html_search_meta(['og:description', 'twitter:description'], webpage, 'description', default=None) or \ + self._html_search_regex(r']+class="description"[^>]*>(?P.*?)', webpage, 'description', default=None, flags=re.DOTALL) - # brightcove url - brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) + # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, + # while also making TverIE's own acquired entities such as description available. 
+ info_dict = { + '_type': 'url_transparent', + 'url': brightcove_url, + 'ie_key': BrightcoveNewIE.ie_key(), + 'id': video_id, # Tver ID + 'display_id': brightcove_video_id, # Brightcove ID + 'title': brightcove_info.get('name'), + 'description': description, + 'thumbnail': re.sub(r'/[0-9]+x[0-9]+/', r'/1920x1080/', brightcove_info.get('poster')), # select large thumbnail + 'creator': tver_info[7], # Broadcaster name e.g. 'tbs', 'ntv' + 'uploader': tver_info[8], # Delivery platform name e.g. 'TBS FREE', '日テレ無料' + } - # debug output - if self._downloader.params.get('verbose', False): - self.to_screen('Video Information: %s' % video_info) - self.to_screen('Brightcove Account ID: %s' % brightcove_account_id) - self.to_screen('Brightcove Video ID: %s' % brightcove_video_id) - self.to_screen('Brightcove URL: %s' % brightcove_url) + return info_dict - # evacuate _VALID_URL - _VALID_URL = self._VALID_URL + def _extract_brightcove_info(self, url, referrer): - # temporarily replace _VALID_URL - # prevent _VALID_URL from being the URL of Tver when executing the parent class's _real_extract () method - self._VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*(?Pvideo|playlist)Id=(?P\d+|ref:[^&]+)' + valid_url = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*(?Pvideo|playlist)Id=(?P\d+|ref:[^&]+)' - # get video information - info_dict = super(TVerIE, self)._real_extract(brightcove_url) + account_id, player_id, embed, content_type, video_id = re.match(valid_url, url).groups() - # get video description - description = \ - self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) or \ - self._html_search_regex(r']+class="description"[^>]*>(?P.*?)', webpage, 'description', default=None, flags=re.DOTALL) + def extract_policy_key(): + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, 
embed), video_id) - # undo _VALID_URL - self._VALID_URL = _VALID_URL + policy_key = None - # TVer ID - info_dict['id'] = video_id - # Brightcove ID - info_dict['display_id'] = brightcove_video_id - # select large thumbnail - info_dict['thumbnail'] = info_dict.get('thumbnail').replace('160x90', '1920x1080') - # desctiption - info_dict['description'] = description + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') - return info_dict + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + + return policy_key + + # brightcove api url + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) + + # set header + headers = { + 'Accept': 'application/json;pk=%s' % extract_policy_key(), + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + 'Referer': referrer, + } + + # return brightcove api info + return self._download_json(api_url, video_id, headers=headers) From bcae9d7a02999c0ecec0cbc875925c5780fe9932 Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Wed, 30 Sep 2020 02:43:27 +0900 Subject: [PATCH 5/8] [tver] Added support for videos distributed by Fuji TV (FOD, Fuji TV On Demand) hosted by its own distribution system --- youtube_dl/extractor/tver.py | 147 ++++++++++++++++++++++++++--------- 1 file changed, 109 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 52b714b8b07..752848872c0 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -12,23 +12,44 @@ class TVerIE(InfoExtractor): - _TEST = { - 'url': 'https://tver.jp/feature/f0057485', # In addition to 'feature', there are also categories such as 'corner' and 'episode'. 
- 'md5': '4ae1bc00e6d55af8f7e2b2c17029f1a3', # MD5 hash of a short video downloaded by running youtube-dl with the --test option - 'info_dict': { - 'id': 'f0057485', # TVer ID - 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID - 'ext': 'mp4', - 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', - 'description': 'md5:92ce839312ee1e9b162de73fa08b6374', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 4100.032, - 'timestamp': 1600308623, - 'upload_date': '20200917', - 'uploader_id': '4031511847001', + _TESTS = [ + { + # Delivery from Brightcove + 'url': 'https://tver.jp/feature/f0057485', # In addition to 'feature', there are also categories such as 'corner' and 'episode'. + 'md5': '4ae1bc00e6d55af8f7e2b2c17029f1a3', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0057485', # TVer ID + 'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID + 'ext': 'mp4', + 'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる', + 'description': 'md5:92ce839312ee1e9b162de73fa08b6374', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 4100.032, + 'timestamp': 1600308623, + 'upload_date': '20200917', + 'uploader_id': '4031511847001', + 'creator': 'tbs', # Means TBS TV + 'uploader': 'TBS FREE', # Content provider + }, + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', }, - 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', - } + { + # Delivery from FOD (Fuji TV On Demand) + 'url': 'https://tver.jp/corner/f0057932', # In addition to 'feature', there are also categories such as 'corner' and 'episode'. 
+ 'md5': '6d1970594e532f4b1d6403b5bf9d0d67', # MD5 hash of a short video downloaded by running youtube-dl with the --test option + 'info_dict': { + 'id': 'f0057932', # TVer ID + 'display_id': '5d40810015', # FOD ID + 'ext': 'mp4', + 'title': 'ちびまる子ちゃん #1258「秋のお楽しみメニュー~まる子の昔ばなし~ 『まる子の涼しい大作戦』の巻/『まる子のおむすびころりん』の巻」', + 'description': 'md5:328c6ef38bed76588a1f6eb5d69c4a7c', + 'thumbnail': r're:https?://.*\.jpg$', + 'creator': 'cx', # Means Fuji TV + 'uploader': 'FOD見逃し無料', # Content provider + }, + 'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos', + }, + ] IE_NAME = 'TVer' IE_DESC = 'TVer' @@ -36,9 +57,6 @@ class TVerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?Pf?[0-9]+)' _GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - - # TODO: FOD対応 def _real_extract(self, url): video_id = self._match_id(url) @@ -49,31 +67,84 @@ def _real_extract(self, url): tver_info_csv = tver_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and ' tver_info = tver_info_csv.split(',') - # extract brightcove information - brightcove_account_id = tver_info[3] - brightcove_video_id = 'ref:' + tver_info[4] - brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id) - brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') + # extract tver title + title = tver_info[5] + ' ' + tver_info[6].lstrip() # title + subtitle # extract tver description description = \ self._html_search_meta(['og:description', 'twitter:description'], webpage, 'description', default=None) or \ self._html_search_regex(r']+class="description"[^>]*>(?P.*?)', webpage, 'description', default=None, flags=re.DOTALL) - # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, - # while also making TverIE's own acquired 
entities such as description available. - info_dict = { - '_type': 'url_transparent', - 'url': brightcove_url, - 'ie_key': BrightcoveNewIE.ie_key(), - 'id': video_id, # Tver ID - 'display_id': brightcove_video_id, # Brightcove ID - 'title': brightcove_info.get('name'), - 'description': description, - 'thumbnail': re.sub(r'/[0-9]+x[0-9]+/', r'/1920x1080/', brightcove_info.get('poster')), # select large thumbnail - 'creator': tver_info[7], # Broadcaster name e.g. 'tbs', 'ntv' - 'uploader': tver_info[8], # Delivery platform name e.g. 'TBS FREE', '日テレ無料' - } + # Note: Of the videos on TVer, only the videos distributed by Fuji TV (FOD, Fuji TV On Demand) + # use our own distribution system instead of Brightcove. + if tver_info[7] == 'cx': + + # extract fod information + fod_video_id = tver_info[3] + fod_url = 'https://i.fod.fujitv.co.jp/abr/pc_html5/%s.m3u8' % fod_video_id + fod_thumbnail = 'https://i.fod.fujitv.co.jp/pc/image/wbtn/wbtn_%s.jpg' % fod_video_id + + # extract fod formats + fod_formats = self._extract_m3u8_formats(fod_url, fod_video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + # Note: All 'RESOLUTION' values in the playlist are 360p, + # but this is a fake value and will be replaced based on what you actually downloaded and measured. 
+ for index, fod_fotmat in enumerate(fod_formats): + # 720p, 2000kbps + if fod_fotmat['format_id'] == 'hls-2000': + fod_formats[index]['width'] = 1280 + fod_formats[index]['height'] = 720 + # 720p, 1200kbps + elif fod_fotmat['format_id'] == 'hls-1200': + fod_formats[index]['width'] = 1280 + fod_formats[index]['height'] = 720 + # 360p, 800kbps + elif fod_fotmat['format_id'] == 'hls-800': + fod_formats[index]['width'] = 640 + fod_formats[index]['height'] = 360 + # 180p, 300kbps + elif fod_fotmat['format_id'] == 'hls-300': + fod_formats[index]['width'] = 320 + fod_formats[index]['height'] = 180 + + # reverse the format order + fod_formats.reverse() + + info_dict = { + 'id': video_id, # Tver ID + 'display_id': fod_video_id, # FOD ID + 'formats': fod_formats, + 'title': title, + 'description': description, + 'thumbnail': fod_thumbnail, + 'creator': tver_info[7], # Broadcaster name e.g. 'cx' + 'uploader': tver_info[8], # Delivery platform name e.g. 'FOD見逃し無料' + 'tags': [tver_info[5]], + 'is_live': False, + } + + else: + + # extract brightcove information + brightcove_account_id = tver_info[3] + brightcove_video_id = 'ref:' + tver_info[4] + brightcove_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (brightcove_account_id, brightcove_video_id) + brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') + + # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, + # while also making TverIE's own acquired entities such as description available. + info_dict = { + '_type': 'url_transparent', + 'url': brightcove_url, + 'ie_key': BrightcoveNewIE.ie_key(), + 'id': video_id, # Tver ID + 'display_id': brightcove_video_id, # Brightcove ID + 'title': title or brightcove_info.get('name'), + 'description': description, + 'thumbnail': re.sub(r'/[0-9]+x[0-9]+/', r'/1920x1080/', brightcove_info.get('poster')), # select large thumbnail + 'creator': tver_info[7], # Broadcaster name e.g. 
'tbs', 'ntv' + 'uploader': tver_info[8], # Delivery platform name e.g. 'TBS FREE', '日テレ無料' + } return info_dict From e9c481f27a66137d79755dd9facd96cb5eea190a Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Mon, 26 Oct 2020 02:48:14 +0900 Subject: [PATCH 6/8] [tver] Update supportedsites.md --- docs/supportedsites.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 367545a96b2..4b9bac2ae63 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -935,6 +935,7 @@ - **TVANouvellesArticle** - **TVC** - **TVCArticle** + - **TVer**: TVer - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** From dbda3bcfcafd823738e8a3a2b1b1980c98adae36 Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Mon, 26 Oct 2020 05:46:04 +0900 Subject: [PATCH 7/8] [tver] Fix incorrect case --- youtube_dl/extractor/tver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 752848872c0..ac70182dccb 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -111,7 +111,7 @@ def _real_extract(self, url): fod_formats.reverse() info_dict = { - 'id': video_id, # Tver ID + 'id': video_id, # TVer ID 'display_id': fod_video_id, # FOD ID 'formats': fod_formats, 'title': title, @@ -132,12 +132,12 @@ def _real_extract(self, url): brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, - # while also making TverIE's own acquired entities such as description available. + # while also making TVerIE's own acquired entities such as description available. 
info_dict = { '_type': 'url_transparent', 'url': brightcove_url, 'ie_key': BrightcoveNewIE.ie_key(), - 'id': video_id, # Tver ID + 'id': video_id, # TVer ID 'display_id': brightcove_video_id, # Brightcove ID 'title': title or brightcove_info.get('name'), 'description': description, From c7ccb218b477d96d83891357db44c40d6c41f372 Mon Sep 17 00:00:00 2001 From: tsukumijima Date: Fri, 13 Nov 2020 04:32:37 +0900 Subject: [PATCH 8/8] [tver] Fix issue where some videos would fail to download with an HTTP Error 404 --- youtube_dl/extractor/tver.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index ac70182dccb..63562991b90 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -5,8 +5,10 @@ from .brightcove import BrightcoveNewIE from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( js_to_json, + ExtractorError, ) @@ -127,9 +129,11 @@ def _real_extract(self, url): # extract brightcove information brightcove_account_id = tver_info[3] + brightcove_player_id = tver_info[1] brightcove_video_id = 'ref:' + tver_info[4] - brightcove_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (brightcove_account_id, brightcove_video_id) - brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') + brightcove_url = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' \ + % (brightcove_account_id, brightcove_player_id, brightcove_video_id) + brightcove_url, brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/') # Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent, # while also making TVerIE's own acquired entities such as description available. 
@@ -186,5 +190,16 @@ def extract_policy_key():
             'Referer': referrer,
         }
 
-        # return brightcove api info
-        return self._download_json(api_url, video_id, headers=headers)
+        # get brightcove api info
+        try:
+            response = self._download_json(api_url, video_id, headers=headers)
+        except ExtractorError as e:
+            # Only an HTTP error (e.g. 404) is worth retrying; re-raise anything
+            # else so `response` is never left unbound at the return below.
+            if not isinstance(e.cause, compat_HTTPError):
+                raise
+            # Remove 'ref:' from the player URL and the API URL, then retry once
+            url = url.replace('ref:', '')
+            response = self._download_json(api_url.replace('ref:', ''), video_id, headers=headers)
+
+        return url, response