From 33db516443bd4a9ac513b214c736f2178c0b808d Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 15 Oct 2020 14:24:17 +0200 Subject: [PATCH 01/13] [gedidigital] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gedidigital.py | 199 ++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 youtube_dl/extractor/gedidigital.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 62819ddcf30..1dcb9b33bac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -421,6 +421,7 @@ from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py new file mode 100644 index 00000000000..cc1492589ab --- /dev/null +++ b/youtube_dl/extractor/gedidigital.py @@ -0,0 +1,199 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from var_dump import var_dump + +from .common import InfoExtractor + + +class GediDigitalBaseIE(InfoExtractor): + @staticmethod + def _clean_audio_fmts(formats): + unique_formats = [] + for f in formats: + if 'acodec' in f: + unique_formats.append(f) + formats[:] = unique_formats + + def _real_extract(self, url): + u = re.match(self._VALID_URL, url) + self.IE_NAME = u.group('iename') if u.group('iename') else 'gedi' + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + player_data = re.findall( + r'PlayerFactory\.setParam\(\'(?P.+?)\',\s*\'(?P.+?)\',\s*\'(?P.+?)\'\);', + webpage) + + formats = [] + audio_fmts = [] + hls_fmts = [] + http_fmts = [] + title = '' + thumb = '' + + fmt_reg = r'(?Pvideo|audio)-(?P

<p>rrtv|hls)-(?P<h>[\w\d]+)(?:-(?P<br>[\w\d]+))?$' + br_reg = r'video-rrtv-(?P<br>
\d+)-' + for t, n, v in player_data: + if t == 'format': + m = re.match(fmt_reg, n) + if m: + # audio formats + if m.group('t') == 'audio': + if m.group('p') == 'hls': + audio_fmts.extend(self._extract_m3u8_formats( + v, video_id, 'm4a', m3u8_id='hls', fatal=False)) + elif m.group('p') == 'rrtv': + audio_fmts.append({ + 'format_id': 'mp3', + 'url': v, + 'tbr': 128, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + + # video formats + elif m.group('t') == 'video': + # hls manifest video + if m.group('p') == 'hls': + hls_fmts.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', m3u8_id='hls', fatal=False)) + # direct mp4 video + elif m.group('p') == 'rrtv': + if not m.group('br'): + mm = re.search(br_reg, v) + http_fmts.append({ + 'format_id': 'https-' + m.group('h'), + 'protocol': 'https', + 'url': v, + 'tbr': int(m.group('br')) if m.group('br') else + (int(mm.group('br')) if mm.group('br') else 0), + 'height': int(m.group('h')) + }) + + elif t == 'param': + if n == 'videotitle': + title = v + if n == 'image_full_play': + thumb = v + + title = re.sub(r'\s*-\s*La Stampa', '', self._og_search_title(webpage)) if title == '' else title + + if audio_fmts: + self._clean_audio_fmts(audio_fmts) + self._sort_formats(audio_fmts) + if hls_fmts: + self._sort_formats(hls_fmts) + if http_fmts: + self._sort_formats(http_fmts) + + formats.extend(audio_fmts) + formats.extend(hls_fmts) + formats.extend(http_fmts) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('twitter:description', webpage), + 'thumbnail': thumb, + 'formats': formats, + } + + +class GediDigitalIE(GediDigitalBaseIE): + IE_NAME = '' + _VALID_URL = r'''(?x)https://video\. 
+ (?P + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + |iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + ) + (?:\.gelocal)?\.it/.+?/(?P[\d/]+)$''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559/121683', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/22/731397/731397-thumb-social-play.jpg', + }, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'md5': 'e763b94b7920799a0e0e23ffefa2d157', + 'info_dict': { + 'id': '367415/367963', + 'ext': 'mp4', + 'title': 'Record della pista a Spa Francorchamps, la Pagani Huayra Roadster BC stupisce', + 'description': 'md5:5deb503cefe734a3eb3f07ed74303920', + 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/19/730799/730799-thumb-social-play.jpg', + }, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'md5': 'e48108e97b1af137d22a8469f2019057', + 'info_dict': { + 'id': '66184/66267', + 'ext': 'mp4', + 'title': 'Cassani e i brividi azzurri ai Mondiali di Imola: \\"Qui mi sono innamorato del ciclismo da ragazzino, incredibile tornarci da ct\\"', + 'description': 'md5:fc9c50894f70a2469bb9b54d3d0a3d3b', + 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/23/731520/731520-thumb-social-play.jpg', + }, + }, { + 'url': 
'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'md5': 'a6e39f3bdc1842bbd92abbbbef230817', + 'info_dict': { + 'id': '141059/142723', + 'ext': 'mp4', + 'title': 'Dentro la notizia - Ferrari, cosa succede a Maranello', + 'description': 'md5:9907d65b53765681fa3a0b3122617c1f', + 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/23/731504/731504-thumb-social-play.jpg', + }, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }, 
{ + 'url': 'https://video.lasentinella.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'only_matching': True, + }] From 8553b84bffcffc1cb909ea396225a0b649fd2437 Mon Sep 17 00:00:00 2001 From: nixxo Date: Tue, 20 Oct 2020 15:55:46 +0200 Subject: [PATCH 02/13] [gedidigital] title cleanup, thumb regex, added new test --- youtube_dl/extractor/gedidigital.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index cc1492589ab..7cd6fb6d373 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -2,9 +2,9 @@ from __future__ import unicode_literals import re -from var_dump import var_dump from .common import InfoExtractor +from ..compat import compat_str class GediDigitalBaseIE(InfoExtractor): @@ -35,6 +35,7 @@ def _real_extract(self, url): fmt_reg = r'(?Pvideo|audio)-(?P

<p>rrtv|hls)-(?P<h>[\w\d]+)(?:-(?P<br>[\w\d]+))?$' br_reg = r'video-rrtv-(?P<br>
\d+)-' + for t, n, v in player_data: if t == 'format': m = re.match(fmt_reg, n) @@ -79,7 +80,10 @@ def _real_extract(self, url): if n == 'image_full_play': thumb = v - title = re.sub(r'\s*-\s*La Stampa', '', self._og_search_title(webpage)) if title == '' else title + title = self._og_search_title(webpage) if title == '' else title + + # clean weird char + title = compat_str(title).encode('utf8', 'replace').replace(b'\xc3\x82', b'').decode('utf8', 'replace') if audio_fmts: self._clean_audio_fmts(audio_fmts) @@ -122,7 +126,7 @@ class GediDigitalIE(GediDigitalBaseIE): |corrierealpi |lasentinella ) - (?:\.gelocal)?\.it/.+?/(?P[\d/]+)$''' + (?:\.gelocal)?\.it/.+?/(?P[\d/]+)(?:\?|\&|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', @@ -131,7 +135,7 @@ class GediDigitalIE(GediDigitalBaseIE): 'ext': 'mp4', 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', - 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/22/731397/731397-thumb-social-play.jpg', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', }, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', @@ -141,7 +145,7 @@ class GediDigitalIE(GediDigitalBaseIE): 'ext': 'mp4', 'title': 'Record della pista a Spa Francorchamps, la Pagani Huayra Roadster BC stupisce', 'description': 'md5:5deb503cefe734a3eb3f07ed74303920', - 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/19/730799/730799-thumb-social-play.jpg', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', }, }, { 'url': 
'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', @@ -151,7 +155,7 @@ class GediDigitalIE(GediDigitalBaseIE): 'ext': 'mp4', 'title': 'Cassani e i brividi azzurri ai Mondiali di Imola: \\"Qui mi sono innamorato del ciclismo da ragazzino, incredibile tornarci da ct\\"', 'description': 'md5:fc9c50894f70a2469bb9b54d3d0a3d3b', - 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/23/731520/731520-thumb-social-play.jpg', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', }, }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', @@ -161,7 +165,17 @@ class GediDigitalIE(GediDigitalBaseIE): 'ext': 'mp4', 'title': 'Dentro la notizia - Ferrari, cosa succede a Maranello', 'description': 'md5:9907d65b53765681fa3a0b3122617c1f', - 'thumbnail': 'https://www.repstatic.it/video/photo/2020/09/23/731504/731504-thumb-social-play.jpg', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'md5': '0391c2c83c6506581003aaf0255889c0', + 'info_dict': { + 'id': '14772/14870', + 'ext': 'mp4', + 'title': 'Festival EMERGENCY, Villa: «La buona informazione aiuta la salute» (14772-14870)', + 'description': 'md5:2bce954d278248f3c950be355b7c2226', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', }, }, { 'url': 'https://video.messaggeroveneto.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', From 5a687f0c9311e20d92eea9461f7735d2204f4e10 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 22 Oct 2020 13:36:18 +0200 Subject: [PATCH 03/13] [gedi] https regex fix --- youtube_dl/extractor/gedidigital.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 7cd6fb6d373..0dd525d64bb 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -108,7 +108,7 @@ def _real_extract(self, url): class GediDigitalIE(GediDigitalBaseIE): IE_NAME = '' - _VALID_URL = r'''(?x)https://video\. + _VALID_URL = r'''(?x)https?://video\. (?P (?:espresso\.)?repubblica |lastampa From 9c2bbd6c7af5c416893cf6433320f5d894ce4eaa Mon Sep 17 00:00:00 2001 From: nixxo Date: Tue, 19 Jan 2021 16:47:03 +0100 Subject: [PATCH 04/13] [gedidigital] improvements - improved and simplified formats extraction using _extract_akamai_formats - test md5 fix - improved thumbnail extraction --- youtube_dl/extractor/gedidigital.py | 79 +++++++++-------------------- 1 file changed, 23 insertions(+), 56 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 0dd525d64bb..ea709fbe713 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -10,6 +10,7 @@ class GediDigitalBaseIE(InfoExtractor): @staticmethod def _clean_audio_fmts(formats): + # remove duplicates audio formats unique_formats = [] for f in formats: if 'acodec' in f: @@ -28,74 +29,40 @@ def _real_extract(self, url): formats = [] audio_fmts = [] - hls_fmts = [] - http_fmts = [] - title = '' - thumb = '' - - fmt_reg = r'(?Pvideo|audio)-(?P

<p>rrtv|hls)-(?P<h>[\w\d]+)(?:-(?P<br>[\w\d]+))?$' - br_reg = r'video-rrtv-(?P<br>
\d+)-' + title = None + thumb = None for t, n, v in player_data: if t == 'format': - m = re.match(fmt_reg, n) - if m: - # audio formats - if m.group('t') == 'audio': - if m.group('p') == 'hls': - audio_fmts.extend(self._extract_m3u8_formats( - v, video_id, 'm4a', m3u8_id='hls', fatal=False)) - elif m.group('p') == 'rrtv': - audio_fmts.append({ - 'format_id': 'mp3', - 'url': v, - 'tbr': 128, - 'ext': 'mp3', - 'vcodec': 'none', - 'acodec': 'mp3', - }) - - # video formats - elif m.group('t') == 'video': - # hls manifest video - if m.group('p') == 'hls': - hls_fmts.extend(self._extract_m3u8_formats( - v, video_id, 'mp4', m3u8_id='hls', fatal=False)) - # direct mp4 video - elif m.group('p') == 'rrtv': - if not m.group('br'): - mm = re.search(br_reg, v) - http_fmts.append({ - 'format_id': 'https-' + m.group('h'), - 'protocol': 'https', - 'url': v, - 'tbr': int(m.group('br')) if m.group('br') else - (int(mm.group('br')) if mm.group('br') else 0), - 'height': int(m.group('h')) - }) - + if n == 'video-hls-vod-ak': + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + if n == 'audio-hls-vod': + audio_fmts.extend(self._extract_m3u8_formats( + v, video_id, 'm4a', m3u8_id='hls', fatal=False)) + if n == 'audio-rrtv-mp3': + audio_fmts.append({ + 'format_id': 'mp3', + 'url': v, + 'tbr': 128, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) elif t == 'param': if n == 'videotitle': title = v - if n == 'image_full_play': + if n in ['image_full_play', 'image_full', 'image']: thumb = v - title = self._og_search_title(webpage) if title == '' else title + title = self._og_search_title(webpage) if not title else title # clean weird char title = compat_str(title).encode('utf8', 'replace').replace(b'\xc3\x82', b'').decode('utf8', 'replace') - if audio_fmts: - self._clean_audio_fmts(audio_fmts) - self._sort_formats(audio_fmts) - if hls_fmts: - self._sort_formats(hls_fmts) - if http_fmts: - self._sort_formats(http_fmts) - + 
self._clean_audio_fmts(audio_fmts) formats.extend(audio_fmts) - formats.extend(hls_fmts) - formats.extend(http_fmts) + self._sort_formats(formats) return { 'id': video_id, @@ -169,7 +136,7 @@ class GediDigitalIE(GediDigitalBaseIE): }, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'md5': '0391c2c83c6506581003aaf0255889c0', + 'md5': 'ca3323b47c94cac92fff03eef0387d97', 'info_dict': { 'id': '14772/14870', 'ext': 'mp4', From 93cd298fa388ad973ee09aa68ac2ba3df8ad9c8c Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 19 Feb 2021 12:08:16 +0100 Subject: [PATCH 05/13] gedidigital: merged two extractor class together --- youtube_dl/extractor/gedidigital.py | 142 ++++++++++++++-------------- 1 file changed, 70 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index ea709fbe713..05658eec41e 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -7,73 +7,7 @@ from ..compat import compat_str -class GediDigitalBaseIE(InfoExtractor): - @staticmethod - def _clean_audio_fmts(formats): - # remove duplicates audio formats - unique_formats = [] - for f in formats: - if 'acodec' in f: - unique_formats.append(f) - formats[:] = unique_formats - - def _real_extract(self, url): - u = re.match(self._VALID_URL, url) - self.IE_NAME = u.group('iename') if u.group('iename') else 'gedi' - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - player_data = re.findall( - r'PlayerFactory\.setParam\(\'(?P.+?)\',\s*\'(?P.+?)\',\s*\'(?P.+?)\'\);', - webpage) - - formats = [] - audio_fmts = [] - title = None - thumb = None - - for t, n, v in player_data: - if t == 'format': - if n == 'video-hls-vod-ak': - formats.extend(self._extract_akamai_formats( - v, video_id, {'http': 'media.gedidigital.it'})) - if n == 'audio-hls-vod': - audio_fmts.extend(self._extract_m3u8_formats( - v, video_id, 'm4a', 
m3u8_id='hls', fatal=False)) - if n == 'audio-rrtv-mp3': - audio_fmts.append({ - 'format_id': 'mp3', - 'url': v, - 'tbr': 128, - 'ext': 'mp3', - 'vcodec': 'none', - 'acodec': 'mp3', - }) - elif t == 'param': - if n == 'videotitle': - title = v - if n in ['image_full_play', 'image_full', 'image']: - thumb = v - - title = self._og_search_title(webpage) if not title else title - - # clean weird char - title = compat_str(title).encode('utf8', 'replace').replace(b'\xc3\x82', b'').decode('utf8', 'replace') - - self._clean_audio_fmts(audio_fmts) - formats.extend(audio_fmts) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': self._html_search_meta('twitter:description', webpage), - 'thumbnail': thumb, - 'formats': formats, - } - - -class GediDigitalIE(GediDigitalBaseIE): +class GediDigitalIE(InfoExtractor): IE_NAME = '' _VALID_URL = r'''(?x)https?://video\. (?P @@ -96,7 +30,7 @@ class GediDigitalIE(GediDigitalBaseIE): (?:\.gelocal)?\.it/.+?/(?P[\d/]+)(?:\?|\&|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', - 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'md5': '60f33c793bc396dc23da682d2453feee', 'info_dict': { 'id': '121559/121683', 'ext': 'mp4', @@ -106,7 +40,7 @@ class GediDigitalIE(GediDigitalBaseIE): }, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', - 'md5': 'e763b94b7920799a0e0e23ffefa2d157', + 'md5': '1737111b9601b2d36b456f992643e911', 'info_dict': { 'id': '367415/367963', 'ext': 'mp4', @@ -116,7 +50,7 @@ class GediDigitalIE(GediDigitalBaseIE): }, }, { 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', - 'md5': 'e48108e97b1af137d22a8469f2019057', + 'md5': '696a20e29a83422125995fc371879bb8', 'info_dict': { 'id': 
'66184/66267', 'ext': 'mp4', @@ -126,7 +60,7 @@ class GediDigitalIE(GediDigitalBaseIE): }, }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', - 'md5': 'a6e39f3bdc1842bbd92abbbbef230817', + 'md5': '6e5e85fac6cdd8b41b868d55645b411d', 'info_dict': { 'id': '141059/142723', 'ext': 'mp4', @@ -136,7 +70,7 @@ class GediDigitalIE(GediDigitalBaseIE): }, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'md5': 'ca3323b47c94cac92fff03eef0387d97', + 'md5': '295b7eed409f12c7107f9adca58a0cc6', 'info_dict': { 'id': '14772/14870', 'ext': 'mp4', @@ -178,3 +112,67 @@ class GediDigitalIE(GediDigitalBaseIE): 'url': 'https://video.lasentinella.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', 'only_matching': True, }] + + @staticmethod + def _clean_audio_fmts(formats): + # remove duplicates audio formats + unique_formats = [] + for f in formats: + if 'acodec' in f: + unique_formats.append(f) + formats[:] = unique_formats + + def _real_extract(self, url): + u = re.match(self._VALID_URL, url) + self.IE_NAME = u.group('iename') if u.group('iename') else 'gedi' + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + player_data = re.findall( + r'PlayerFactory\.setParam\(\'(?P.+?)\',\s*\'(?P.+?)\',\s*\'(?P.+?)\'\);', + webpage) + + formats = [] + audio_fmts = [] + title = None + thumb = None + + for t, n, v in player_data: + if t == 'format': + if n == 'video-hls-vod-ak': + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + if n == 'audio-hls-vod': + audio_fmts.extend(self._extract_m3u8_formats( + v, video_id, 'm4a', m3u8_id='hls', fatal=False)) + if n == 'audio-rrtv-mp3': + audio_fmts.append({ + 'format_id': 'mp3', + 'url': v, + 'tbr': 128, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + elif t == 'param': + if n == 'videotitle': 
+ title = v + if n in ['image_full_play', 'image_full', 'image']: + thumb = v + + title = self._og_search_title(webpage) if not title else title + + # clean weird char + title = compat_str(title).encode('utf8', 'replace').replace(b'\xc3\x82', b'').decode('utf8', 'replace') + + self._clean_audio_fmts(audio_fmts) + formats.extend(audio_fmts) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('twitter:description', webpage), + 'thumbnail': thumb, + 'formats': formats, + } From c0c9385b300f531c556d6fdcd97cfec7f7cee542 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 19 Feb 2021 14:45:35 +0100 Subject: [PATCH 06/13] gedi: added direct https mp4 urls generation --- youtube_dl/extractor/gedidigital.py | 77 ++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 05658eec41e..b24a3b4433c 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import compat_str +from ..utils import int_or_none class GediDigitalIE(InfoExtractor): @@ -30,7 +31,7 @@ class GediDigitalIE(InfoExtractor): (?:\.gelocal)?\.it/.+?/(?P[\d/]+)(?:\?|\&|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', - 'md5': '60f33c793bc396dc23da682d2453feee', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', 'info_dict': { 'id': '121559/121683', 'ext': 'mp4', @@ -40,7 +41,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', - 'md5': '1737111b9601b2d36b456f992643e911', + 'md5': 'e763b94b7920799a0e0e23ffefa2d157', 'info_dict': { 'id': '367415/367963', 'ext': 'mp4', @@ -50,7 +51,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 
'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', - 'md5': '696a20e29a83422125995fc371879bb8', + 'md5': 'e48108e97b1af137d22a8469f2019057', 'info_dict': { 'id': '66184/66267', 'ext': 'mp4', @@ -60,7 +61,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', - 'md5': '6e5e85fac6cdd8b41b868d55645b411d', + 'md5': 'a6e39f3bdc1842bbd92abbbbef230817', 'info_dict': { 'id': '141059/142723', 'ext': 'mp4', @@ -70,7 +71,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'md5': '295b7eed409f12c7107f9adca58a0cc6', + 'md5': 'ca3323b47c94cac92fff03eef0387d97', 'info_dict': { 'id': '14772/14870', 'ext': 'mp4', @@ -122,6 +123,62 @@ def _clean_audio_fmts(formats): unique_formats.append(f) formats[:] = unique_formats + @staticmethod + def _generate_http_urls(mp4, formats): + _QUALITY = { + # tbr: w, h + '200': [428, 240], + '400': [428, 240], + '650': [640, 360], + '1200': [640, 360], + '1800': [854, 480], + '2500': [1280, 720], + '3500': [1280, 720], + '4500': [1920, 1080] + } + _PATTERN = r'(rrtv-([\d\,]+)-)' + + def get_format_info(tbr): + br = int_or_none(tbr) + if len(formats) == 1 and not br: + br = formats[0].get('tbr') + + for f in formats: + if f.get('tbr'): + if (br - br / 100 * 10) <= f['tbr'] <= (br + br / 100 * 10): + return [ + f.get('width'), + f.get('height'), + f['tbr'] + ] + return [None, None, None] + + mobj = re.search(_PATTERN, mp4.get('mp4') or '') + if not mobj: + return None + pattern = mobj.group(1) + + qualities = re.search(_PATTERN, mp4.get('manifest') or '') + if qualities: + qualities = qualities.group(2) + qualities = qualities.split(',') if qualities else ['.'] + qualities = [i for i in qualities if i] + 
else: + qualities = [mobj.group(2)] + + http_formats = [] + for q in qualities: + w, h, t = get_format_info(q) + http_formats.append({ + 'url': mp4['mp4'].replace(pattern, 'rrtv-%s-' % q), + 'width': w or _QUALITY[q][0], + 'height': h or _QUALITY[q][1], + 'tbr': t or int(q), + 'protocol': 'https', + 'format_id': 'https-%s' % q, + }) + return http_formats + def _real_extract(self, url): u = re.match(self._VALID_URL, url) self.IE_NAME = u.group('iename') if u.group('iename') else 'gedi' @@ -136,15 +193,19 @@ def _real_extract(self, url): audio_fmts = [] title = None thumb = None + mp4 = {} for t, n, v in player_data: if t == 'format': + if n == 'mp4': + mp4.update({'mp4': v}) if n == 'video-hls-vod-ak': + mp4.update({'manifest': v}) formats.extend(self._extract_akamai_formats( v, video_id, {'http': 'media.gedidigital.it'})) if n == 'audio-hls-vod': audio_fmts.extend(self._extract_m3u8_formats( - v, video_id, 'm4a', m3u8_id='hls', fatal=False)) + v, video_id, 'm4a', m3u8_id='audio-hls', fatal=False)) if n == 'audio-rrtv-mp3': audio_fmts.append({ 'format_id': 'mp3', @@ -167,6 +228,10 @@ def _real_extract(self, url): self._clean_audio_fmts(audio_fmts) formats.extend(audio_fmts) + + if mp4: + formats.extend(self._generate_http_urls(mp4, formats) or []) + self._sort_formats(formats) return { From 8a7db200b742eea4b5fff3a7d31faa5284baf698 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 19 Feb 2021 20:11:21 +0100 Subject: [PATCH 07/13] [gedidigital] fixes following review - improved title extraction with fallback value - added timestamp (with tests) - added fallback values for description and thumbnail - improved regex to extract video player data - removed custom IE_NAME --- youtube_dl/extractor/gedidigital.py | 52 ++++++++++++++++------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index b24a3b4433c..a3910f923d3 100644 --- a/youtube_dl/extractor/gedidigital.py +++ 
b/youtube_dl/extractor/gedidigital.py @@ -4,14 +4,15 @@ import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, +) class GediDigitalIE(InfoExtractor): - IE_NAME = '' _VALID_URL = r'''(?x)https?://video\. - (?P + (?: (?:espresso\.)?repubblica |lastampa |ilsecoloxix @@ -38,6 +39,8 @@ class GediDigitalIE(InfoExtractor): 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'timestamp': 1600788168, + 'upload_date': '20200922', }, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', @@ -48,6 +51,8 @@ class GediDigitalIE(InfoExtractor): 'title': 'Record della pista a Spa Francorchamps, la Pagani Huayra Roadster BC stupisce', 'description': 'md5:5deb503cefe734a3eb3f07ed74303920', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'timestamp': 1600531032, + 'upload_date': '20200919', }, }, { 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', @@ -55,9 +60,11 @@ class GediDigitalIE(InfoExtractor): 'info_dict': { 'id': '66184/66267', 'ext': 'mp4', - 'title': 'Cassani e i brividi azzurri ai Mondiali di Imola: \\"Qui mi sono innamorato del ciclismo da ragazzino, incredibile tornarci da ct\\"', + 'title': 'Cassani e i brividi azzurri ai Mondiali di Imola: \'Qui mi sono innamorato del ciclismo da ragazzino, incredibile tornarci da ct\'', 'description': 'md5:fc9c50894f70a2469bb9b54d3d0a3d3b', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'timestamp': 1600852553, + 'upload_date': '20200923', }, }, { 
'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', @@ -68,6 +75,8 @@ class GediDigitalIE(InfoExtractor): 'title': 'Dentro la notizia - Ferrari, cosa succede a Maranello', 'description': 'md5:9907d65b53765681fa3a0b3122617c1f', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'timestamp': 1600847536, + 'upload_date': '20200923', }, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', @@ -75,9 +84,11 @@ class GediDigitalIE(InfoExtractor): 'info_dict': { 'id': '14772/14870', 'ext': 'mp4', - 'title': 'Festival EMERGENCY, Villa: «La buona informazione aiuta la salute» (14772-14870)', + 'title': 'Festival EMERGENCY, Villa: «La buona informazione aiuta la salute»', 'description': 'md5:2bce954d278248f3c950be355b7c2226', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'timestamp': 1602159940, + 'upload_date': '20201008', }, }, { 'url': 'https://video.messaggeroveneto.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', @@ -180,18 +191,15 @@ def get_format_info(tbr): return http_formats def _real_extract(self, url): - u = re.match(self._VALID_URL, url) - self.IE_NAME = u.group('iename') if u.group('iename') else 'gedi' video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_data = re.findall( - r'PlayerFactory\.setParam\(\'(?P.+?)\',\s*\'(?P.+?)\',\s*\'(?P.+?)\'\);', + r'''PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P(?:audio|video|mp4|image).*?)',\s*'(?P.+?)'\);''', webpage) formats = [] audio_fmts = [] - title = None thumb = None mp4 = {} @@ -208,7 +216,7 @@ def _real_extract(self, url): v, video_id, 'm4a', m3u8_id='audio-hls', fatal=False)) if n == 'audio-rrtv-mp3': audio_fmts.append({ - 'format_id': 'mp3', + 'format_id': 'audio-mp3', 'url': v, 'tbr': 128, 'ext': 'mp3', @@ -216,28 +224,24 @@ def 
_real_extract(self, url): 'acodec': 'mp3', }) elif t == 'param': - if n == 'videotitle': - title = v if n in ['image_full_play', 'image_full', 'image']: thumb = v - title = self._og_search_title(webpage) if not title else title - - # clean weird char - title = compat_str(title).encode('utf8', 'replace').replace(b'\xc3\x82', b'').decode('utf8', 'replace') - - self._clean_audio_fmts(audio_fmts) + # self._clean_audio_fmts(audio_fmts) formats.extend(audio_fmts) - if mp4: formats.extend(self._generate_http_urls(mp4, formats) or []) - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': self._html_search_meta('twitter:description', webpage), - 'thumbnail': thumb, + 'title': self._html_search_meta('twitter:title', webpage) or self._og_search_title(webpage), + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], + webpage, default=None), + 'timestamp': parse_iso8601(self._og_search_property( + ['published_time', 'modified_time'], + webpage, default='').strip()), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), 'formats': formats, } From 8f564a71d1669eaee696fc379eab52fbb712ac30 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 19 Feb 2021 20:37:04 +0100 Subject: [PATCH 08/13] [gedidigital] code clean-up - removed http direct url creation (no longer necessary after extract_akamai_formats teke care of it) - removed audio format clean-up --- youtube_dl/extractor/gedidigital.py | 85 ++--------------------------- 1 file changed, 4 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index a3910f923d3..00bebe183a7 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -4,10 +4,7 @@ import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, -) +from ..utils import parse_iso8601 class GediDigitalIE(InfoExtractor): @@ -125,97 +122,27 @@ class 
GediDigitalIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _clean_audio_fmts(formats): - # remove duplicates audio formats - unique_formats = [] - for f in formats: - if 'acodec' in f: - unique_formats.append(f) - formats[:] = unique_formats - - @staticmethod - def _generate_http_urls(mp4, formats): - _QUALITY = { - # tbr: w, h - '200': [428, 240], - '400': [428, 240], - '650': [640, 360], - '1200': [640, 360], - '1800': [854, 480], - '2500': [1280, 720], - '3500': [1280, 720], - '4500': [1920, 1080] - } - _PATTERN = r'(rrtv-([\d\,]+)-)' - - def get_format_info(tbr): - br = int_or_none(tbr) - if len(formats) == 1 and not br: - br = formats[0].get('tbr') - - for f in formats: - if f.get('tbr'): - if (br - br / 100 * 10) <= f['tbr'] <= (br + br / 100 * 10): - return [ - f.get('width'), - f.get('height'), - f['tbr'] - ] - return [None, None, None] - - mobj = re.search(_PATTERN, mp4.get('mp4') or '') - if not mobj: - return None - pattern = mobj.group(1) - - qualities = re.search(_PATTERN, mp4.get('manifest') or '') - if qualities: - qualities = qualities.group(2) - qualities = qualities.split(',') if qualities else ['.'] - qualities = [i for i in qualities if i] - else: - qualities = [mobj.group(2)] - - http_formats = [] - for q in qualities: - w, h, t = get_format_info(q) - http_formats.append({ - 'url': mp4['mp4'].replace(pattern, 'rrtv-%s-' % q), - 'width': w or _QUALITY[q][0], - 'height': h or _QUALITY[q][1], - 'tbr': t or int(q), - 'protocol': 'https', - 'format_id': 'https-%s' % q, - }) - return http_formats - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_data = re.findall( - r'''PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P(?:audio|video|mp4|image).*?)',\s*'(?P.+?)'\);''', + r'''PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P(?:audio|video|image).*?)',\s*'(?P.+?)'\);''', webpage) formats = [] - audio_fmts = [] thumb = None - mp4 = {} for t, n, v in player_data: if 
t == 'format': - if n == 'mp4': - mp4.update({'mp4': v}) if n == 'video-hls-vod-ak': - mp4.update({'manifest': v}) formats.extend(self._extract_akamai_formats( v, video_id, {'http': 'media.gedidigital.it'})) if n == 'audio-hls-vod': - audio_fmts.extend(self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( v, video_id, 'm4a', m3u8_id='audio-hls', fatal=False)) if n == 'audio-rrtv-mp3': - audio_fmts.append({ + formats.append({ 'format_id': 'audio-mp3', 'url': v, 'tbr': 128, @@ -227,10 +154,6 @@ def _real_extract(self, url): if n in ['image_full_play', 'image_full', 'image']: thumb = v - # self._clean_audio_fmts(audio_fmts) - formats.extend(audio_fmts) - if mp4: - formats.extend(self._generate_http_urls(mp4, formats) or []) self._sort_formats(formats) return { From 92e71b19e67fbb2864466e9e4bb09cf3d7f7c623 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 20 Feb 2021 09:22:30 +0100 Subject: [PATCH 09/13] [gedi] updated unreachable test urls --- youtube_dl/extractor/gedidigital.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 00bebe183a7..e687dab49bd 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -88,37 +88,37 @@ class GediDigitalIE(InfoExtractor): 'upload_date': '20201008', }, }, { - 'url': 'https://video.messaggeroveneto.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', 'only_matching': True, }, { - 'url': 'https://video.ilpiccolo.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', 'only_matching': True, }, { - 'url': 
'https://video.gazzettadimantova.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', 'only_matching': True, }, { - 'url': 'https://video.mattinopadova.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', 'only_matching': True, }, { - 'url': 'https://video.laprovinciapavese.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', 'only_matching': True, }, { - 'url': 'https://video.tribunatreviso.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', 'only_matching': True, }, { - 'url': 'https://video.nuovavenezia.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', 'only_matching': True, }, { - 'url': 'https://video.gazzettadimodena.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', 'only_matching': True, }, { - 'url': 'https://video.lanuovaferrara.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 
'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', 'only_matching': True, }, { - 'url': 'https://video.corrierealpi.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', 'only_matching': True, }, { - 'url': 'https://video.lasentinella.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/133362/134466', + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', 'only_matching': True, }] From afc3669d9bbf083a9ce238be53bb3691f479fa29 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 20 Feb 2021 15:43:35 +0100 Subject: [PATCH 10/13] [gedi] improved formats extraction and updated test hashes --- youtube_dl/extractor/gedidigital.py | 56 ++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index e687dab49bd..b10ce8ad493 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -4,7 +4,10 @@ import re from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_iso8601, +) class GediDigitalIE(InfoExtractor): @@ -29,7 +32,7 @@ class GediDigitalIE(InfoExtractor): (?:\.gelocal)?\.it/.+?/(?P[\d/]+)(?:\?|\&|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', - 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'md5': '1e9bbbfb7c563b6858376fa6e4211b30', 'info_dict': { 'id': '121559/121683', 'ext': 'mp4', @@ -41,7 +44,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 
'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', - 'md5': 'e763b94b7920799a0e0e23ffefa2d157', + 'md5': 'c75ba5637a3c375a1b09062d7a7bd305', 'info_dict': { 'id': '367415/367963', 'ext': 'mp4', @@ -53,7 +56,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', - 'md5': 'e48108e97b1af137d22a8469f2019057', + 'md5': '08097084884a2edfc532fb1f2434d22a', 'info_dict': { 'id': '66184/66267', 'ext': 'mp4', @@ -65,7 +68,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', - 'md5': 'a6e39f3bdc1842bbd92abbbbef230817', + 'md5': 'ad0bfd3683e7e2bbe3f52b0d5c27ecb4', 'info_dict': { 'id': '141059/142723', 'ext': 'mp4', @@ -77,7 +80,7 @@ class GediDigitalIE(InfoExtractor): }, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'md5': 'ca3323b47c94cac92fff03eef0387d97', + 'md5': '0391c2c83c6506581003aaf0255889c0', 'info_dict': { 'id': '14772/14870', 'ext': 'mp4', @@ -127,20 +130,42 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) player_data = re.findall( - r'''PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P(?:audio|video|image).*?)',\s*'(?P.+?)'\);''', + r'''PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P(?:audio|video|image|mp4).*?)',\s*'(?P.+?)'\);''', webpage) formats = [] thumb = None - for t, n, v in player_data: if t == 'format': - if n == 'video-hls-vod-ak': - formats.extend(self._extract_akamai_formats( - v, video_id, {'http': 'media.gedidigital.it'})) - if n == 'audio-hls-vod': + # http direct formats + fmt = re.match(r'(?:video|mp4)(?:-rrtv-)?(\d+)?-?(\d+)?$', n) + if fmt: + formats.append({ + 'format_id': n 
if 'video' in n else 'video-%s' % n, + 'url': v, + 'ext': 'mp4', + 'protocol': 'https', + 'height': int_or_none(fmt.group(1)) or 360, + 'tbr': int_or_none(fmt.group(2)) or ( + 4500 if fmt.group(1) == '1080' else 650), + }) + continue + # hls formats + fmt = re.match(r'(video|audio)-hls-', n) + if fmt: + ext = 'mp4' if fmt.group(1) == 'video' else 'm4a' formats.extend(self._extract_m3u8_formats( - v, video_id, 'm4a', m3u8_id='audio-hls', fatal=False)) + v, video_id, ext, m3u8_id=n, fatal=False)) + continue + # hds formats + if 'video-hds-' in n: + f4m_formats = self._extract_f4m_formats( + '%s?hdcore=3.7.0' % v, video_id, f4m_id=n, fatal=False) + for entry in f4m_formats: + entry.update({'extra_param_to_segment_url': 'hdcore=3.7.0'}) + formats.extend(f4m_formats) + continue + # mp3 audio if n == 'audio-rrtv-mp3': formats.append({ 'format_id': 'audio-mp3', @@ -150,9 +175,8 @@ def _real_extract(self, url): 'vcodec': 'none', 'acodec': 'mp3', }) - elif t == 'param': - if n in ['image_full_play', 'image_full', 'image']: - thumb = v + elif t == 'param' and n in ['image_full_play', 'image_full', 'image']: + thumb = v self._sort_formats(formats) From 47e832018d5b2a4ceb070df8cb0b190aabcbd375 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sun, 21 Feb 2021 12:02:01 +0100 Subject: [PATCH 11/13] [gedi] cleanup following review - removed 'image_full_play' from thumbnail url extraction and adjusted test value - removed 'modified_time' as a value to timestamp generation - put 'only_matching' to some no longer necessary tests - added 'ipad' tag to hls formats selection --- youtube_dl/extractor/gedidigital.py | 52 +++++------------------------ 1 file changed, 8 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index b10ce8ad493..f269ffb49d0 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -38,58 +38,23 @@ class GediDigitalIE(InfoExtractor): 'ext': 'mp4', 'title': 'Il paradosso 
delle Regionali: ecco perché la Lega vince ma sembra aver perso', 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', - 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', 'timestamp': 1600788168, 'upload_date': '20200922', }, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', 'md5': 'c75ba5637a3c375a1b09062d7a7bd305', - 'info_dict': { - 'id': '367415/367963', - 'ext': 'mp4', - 'title': 'Record della pista a Spa Francorchamps, la Pagani Huayra Roadster BC stupisce', - 'description': 'md5:5deb503cefe734a3eb3f07ed74303920', - 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', - 'timestamp': 1600531032, - 'upload_date': '20200919', - }, + 'only_matching': True, }, { 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', - 'md5': '08097084884a2edfc532fb1f2434d22a', - 'info_dict': { - 'id': '66184/66267', - 'ext': 'mp4', - 'title': 'Cassani e i brividi azzurri ai Mondiali di Imola: \'Qui mi sono innamorato del ciclismo da ragazzino, incredibile tornarci da ct\'', - 'description': 'md5:fc9c50894f70a2469bb9b54d3d0a3d3b', - 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', - 'timestamp': 1600852553, - 'upload_date': '20200923', - }, + 'only_matching': True, }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', - 'md5': 'ad0bfd3683e7e2bbe3f52b0d5c27ecb4', - 'info_dict': { - 'id': '141059/142723', - 'ext': 'mp4', - 'title': 'Dentro la notizia - Ferrari, cosa succede a Maranello', - 'description': 'md5:9907d65b53765681fa3a0b3122617c1f', - 'thumbnail': 
r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', - 'timestamp': 1600847536, - 'upload_date': '20200923', - }, + 'only_matching': True, }, { 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'md5': '0391c2c83c6506581003aaf0255889c0', - 'info_dict': { - 'id': '14772/14870', - 'ext': 'mp4', - 'title': 'Festival EMERGENCY, Villa: «La buona informazione aiuta la salute»', - 'description': 'md5:2bce954d278248f3c950be355b7c2226', - 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-social-play\.jpg$', - 'timestamp': 1602159940, - 'upload_date': '20201008', - }, + 'only_matching': True, }, { 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', 'only_matching': True, @@ -151,7 +116,7 @@ def _real_extract(self, url): }) continue # hls formats - fmt = re.match(r'(video|audio)-hls-', n) + fmt = re.match(r'(video|audio)-(hls|ipad)-', n) if fmt: ext = 'mp4' if fmt.group(1) == 'video' else 'm4a' formats.extend(self._extract_m3u8_formats( @@ -175,7 +140,7 @@ def _real_extract(self, url): 'vcodec': 'none', 'acodec': 'mp3', }) - elif t == 'param' and n in ['image_full_play', 'image_full', 'image']: + elif t == 'param' and n in ['image_full', 'image']: thumb = v self._sort_formats(formats) @@ -187,8 +152,7 @@ def _real_extract(self, url): ['twitter:description', 'og:description', 'description'], webpage, default=None), 'timestamp': parse_iso8601(self._og_search_property( - ['published_time', 'modified_time'], - webpage, default='').strip()), + 'published_time', webpage, default='').strip()), 'thumbnail': thumb or self._og_search_thumbnail(webpage), 'formats': formats, } From fbf5cfa9e145851b9322f9c1c46b46e012b54c21 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sun, 21 Feb 2021 14:32:32 +0100 Subject: [PATCH 12/13] [gedi] fixes - removed md5 leftover in test - added comment in test for .strip() necessity - changed default='' to 
fatal=False in _og_search_property for timestamp generation --- youtube_dl/extractor/gedidigital.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index f269ffb49d0..68aeb418f49 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -42,9 +42,12 @@ class GediDigitalIE(InfoExtractor): 'timestamp': 1600788168, 'upload_date': '20200922', }, + }, { + # .strip() necessary in timezone creation + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, }, { 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', - 'md5': 'c75ba5637a3c375a1b09062d7a7bd305', 'only_matching': True, }, { 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', @@ -52,9 +55,6 @@ class GediDigitalIE(InfoExtractor): }, { 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', 'only_matching': True, - }, { - 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'only_matching': True, }, { 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', 'only_matching': True, @@ -152,7 +152,7 @@ def _real_extract(self, url): ['twitter:description', 'og:description', 'description'], webpage, default=None), 'timestamp': parse_iso8601(self._og_search_property( - 'published_time', webpage, default='').strip()), + 'published_time', webpage, fatal=False).strip()), 'thumbnail': thumb or self._og_search_thumbnail(webpage), 'formats': formats, } From 549621473da8560260320f692a7427f10270ee85 Mon Sep 17 00:00:00 2001 From:
nixxo Date: Mon, 22 Feb 2021 19:18:37 +0100 Subject: [PATCH 13/13] [gedi] removed timestamp extraction and related test values --- youtube_dl/extractor/gedidigital.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 68aeb418f49..dc48b1f153d 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -4,10 +4,7 @@ import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, -) +from ..utils import int_or_none class GediDigitalIE(InfoExtractor): @@ -39,11 +36,8 @@ class GediDigitalIE(InfoExtractor): 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', - 'timestamp': 1600788168, - 'upload_date': '20200922', }, }, { - # .strip() necessary in timezone creation 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', 'only_matching': True, }, { @@ -151,8 +145,6 @@ def _real_extract(self, url): 'description': self._html_search_meta( ['twitter:description', 'og:description', 'description'], webpage, default=None), - 'timestamp': parse_iso8601(self._og_search_property( - 'published_time', webpage, fatal=False).strip()), 'thumbnail': thumb or self._og_search_thumbnail(webpage), 'formats': formats, }