From 7bac52839c697dac58c93d9d440f5c473d168984 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 14:48:45 +0100 Subject: [PATCH] [rai] improve subtitles extraction (#27705) closes #27698 --- test/test_subtitles.py | 12 +++++++-- youtube_dl/extractor/rai.py | 50 ++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 17aaaf20d9a..550e0ca0081 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -258,16 +258,24 @@ def test_allsubtitles(self): class TestRaiPlaySubtitles(BaseTestSubtitles): - url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiPlayIE - def test_allsubtitles(self): + def test_subtitles_key(self): + self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + def test_subtitles_array_key(self): + self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') + class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 0a68d16b038..67b86fc72c1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -103,22 +103,28 @@ def _extract_relinker_info(self, relinker_url, video_id): }.items() if v is not None) @staticmethod - def _extract_subtitles(url, subtitle_url): + def _extract_subtitles(url, video_data): + STL_EXT = 'stl' + SRT_EXT = 'srt' subtitles = {} - if subtitle_url and isinstance(subtitle_url, compat_str): - subtitle_url = urljoin(url, subtitle_url) - STL_EXT = '.stl' - SRT_EXT = '.srt' - subtitles['it'] = [{ - 'ext': 'stl', - 'url': subtitle_url, - }] - if subtitle_url.endswith(STL_EXT): - srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT - subtitles['it'].append({ - 'ext': 'srt', - 'url': srt_url, + subtitles_array = video_data.get('subtitlesArray') or [] + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) + for subtitle in subtitles_array: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, compat_str): + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) + subtitles.setdefault(sub_lang, []).append({ + 'ext': sub_ext, + 'url': sub_url, }) + if STL_EXT == sub_ext: + subtitles[sub_lang].append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, + }) return subtitles @@ -138,6 +144,9 @@ class RaiPlayIE(RaiBaseIE): 'duration': 6160, 'series': 'Report', 'season': '2013/14', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -145,6 +154,10 @@ class RaiPlayIE(RaiBaseIE): }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -172,7 +185,7 @@ def _real_extract(self, url): if date_published and time_published: date_published += ' ' + time_published - subtitles = self._extract_subtitles(url, video.get('subtitles')) + subtitles = self._extract_subtitles(url, video) program_info = media.get('program_info') or {} season = media.get('season') @@ -327,7 +340,7 @@ class RaiIE(RaiBaseIE): 'skip_download': True, }, }, { - # ContentItem in iframe (see #12652) + # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', 'info_dict': { 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', @@ -335,6 +348,9 @@ class RaiIE(RaiBaseIE): 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', 'description': 'md5:d291b03407ec505f95f27970c0b025f4', 'upload_date': '20150913', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -379,7 +395,7 @@ def _extract_from_content_id(self, content_id, url): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + subtitles = self._extract_subtitles(url, media) info = { 'id': content_id,