From f19b07f6bea5b8ae39025e3dfb23f3a6ee05cce5 Mon Sep 17 00:00:00 2001 From: Kay B <> Date: Sun, 3 Sep 2017 14:12:08 +0200 Subject: [PATCH 1/5] [Heise] Add support for new c't uplink episodes --- youtube_dl/extractor/heise.py | 77 ++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 382f32771db..72430258d45 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -9,6 +9,8 @@ xpath_text, ) +import re + class HeiseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html' @@ -25,6 +27,14 @@ class HeiseIE(InfoExtractor): 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', 'thumbnail': r're:^https?://.*/gallery/$', } + }, { + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-18-5-Android-Oreo-Nokia-Galaxy-Note-8-AMD-Ryzen-Threadripper-3812972.html',# noqa + 'info_dict': { + 'id': '3812972', + 'ext': 'mp4', + 'title': "c't uplink 18.5: Android Oreo, Nokia, Galaxy Note 8, AMD Ryzen Threadripper",# noqa + 'description': 'md5:0601ade34ae5c4f5058d378327928348' + } }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -53,11 +63,20 @@ def _real_extract(self, url): r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', webpage, 'title') - doc = self._download_xml( - 'http://www.heise.de/videout/feed', video_id, query={ - 'container': container_id, - 'sequenz': sequenz_id, - }) + # videout/feed still and exclusively works for older videos + # for new ct uplink episodes, we need the work-around below. 
+ try: + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query={ + 'container': container_id, + 'sequenz': sequenz_id, + }) + except Exception as e: + if e.cause.code == 404: + if title.rfind('c\'t') != -1: + return self.ctUplinkHelper(title, video_id) + else: + raise e formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): @@ -88,3 +107,51 @@ def _real_extract(self, url): self._html_search_meta('date', webpage)), 'formats': formats, } + + def ctUplinkHelper(self, title, video_id): + formats = [] + + # e.g. "18.5" from "c't uplink 18.5:" + episode_str = re.findall(r'[0-9]{1,2}.[0-9]{1,2}', title) + + sd_rss_feed = self._download_xml( + 'https://blog.ct.de/ctuplink/ctuplinkvideo.rss', + video_id, "Downloading alternative XML (SD)") + hd_rss_feed = self._download_xml( + 'https://blog.ct.de/ctuplink/ctuplinkvideohd.rss', + video_id, "Downloading alternative XML (HD)") + + titles = hd_rss_feed.findall('./channel/item/title') + descriptions = hd_rss_feed.findall('./channel/item/description') + + sd_video_urls = sd_rss_feed.findall('./channel/item/guid') + hd_video_urls = hd_rss_feed.findall('./channel/item/guid') + + # try to find the real matching title. it might be misformatted or so. + # thereby only rely on the episode_str, e.g. 
"18.5" + episode_index = -1 + for index, item in enumerate(titles): + if titles[index].text.rfind(episode_str[0]) != -1: + episode_index = index + break + + # in case something went wrong + if episode_index == -1: + return + + formats.append({ + 'url': sd_video_urls[episode_index].text, + 'height': 360}) + + formats.append({ + 'url': hd_video_urls[episode_index].text, + 'height': 720}) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': descriptions[episode_index].text, + 'formats': formats + } From 36d79d4a3ddfb024d23c354dd7f0f121275e5eb0 Mon Sep 17 00:00:00 2001 From: Kay B <> Date: Sun, 17 Sep 2017 22:55:52 +0200 Subject: [PATCH 2/5] [Heise] Remove code duplications for PR #14108 --- youtube_dl/extractor/heise.py | 36 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 72430258d45..991e6520c74 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -109,43 +109,37 @@ def _real_extract(self, url): } def ctUplinkHelper(self, title, video_id): - formats = [] - # e.g. 
"18.5" from "c't uplink 18.5:" episode_str = re.findall(r'[0-9]{1,2}.[0-9]{1,2}', title) - sd_rss_feed = self._download_xml( + feeds = [ + self._download_xml( 'https://blog.ct.de/ctuplink/ctuplinkvideo.rss', - video_id, "Downloading alternative XML (SD)") - hd_rss_feed = self._download_xml( + video_id, "Downloading alternative XML (SD)"), + self._download_xml( 'https://blog.ct.de/ctuplink/ctuplinkvideohd.rss', - video_id, "Downloading alternative XML (HD)") - - titles = hd_rss_feed.findall('./channel/item/title') - descriptions = hd_rss_feed.findall('./channel/item/description') + video_id, "Downloading alternative XML (HD)")] - sd_video_urls = sd_rss_feed.findall('./channel/item/guid') - hd_video_urls = hd_rss_feed.findall('./channel/item/guid') + titles = feeds[0].findall('./channel/item/title') + descriptions = feeds[0].findall('./channel/item/description') - # try to find the real matching title. it might be misformatted or so. - # thereby only rely on the episode_str, e.g. "18.5" + # try to find the real matching title. + # only rely on the episode_str, e.g. "18.5". 
episode_index = -1 for index, item in enumerate(titles): if titles[index].text.rfind(episode_str[0]) != -1: episode_index = index break - # in case something went wrong + # in case episode not found at all if episode_index == -1: return - formats.append({ - 'url': sd_video_urls[episode_index].text, - 'height': 360}) - - formats.append({ - 'url': hd_video_urls[episode_index].text, - 'height': 720}) + formats = [] + for feed in feeds: + formats.append({ + 'url': feed.findall('./channel/item/guid')[episode_index].text, + 'ext': 'mp4'}) self._sort_formats(formats) From 5da955312c35454c4eeee00440a14ff03d46f48d Mon Sep 17 00:00:00 2001 From: Kay B <> Date: Sun, 17 Sep 2017 23:42:38 +0200 Subject: [PATCH 3/5] [Heise] Add audio rss, resolve url redirects --- youtube_dl/extractor/heise.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 991e6520c74..7124019dfb3 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -114,10 +114,13 @@ def ctUplinkHelper(self, title, video_id): feeds = [ self._download_xml( - 'https://blog.ct.de/ctuplink/ctuplinkvideo.rss', + 'https://www.heise.de/ct/uplink/ctuplink.rss', + video_id, "Downloading alternative XML (audio)"), + self._download_xml( + 'https://www.heise.de/ct/uplink/ctuplinkvideo.rss', video_id, "Downloading alternative XML (SD)"), self._download_xml( - 'https://blog.ct.de/ctuplink/ctuplinkvideohd.rss', + 'https://www.heise.de/ct/uplink/ctuplinkvideohd.rss', video_id, "Downloading alternative XML (HD)")] titles = feeds[0].findall('./channel/item/title') @@ -131,15 +134,16 @@ def ctUplinkHelper(self, title, video_id): episode_index = index break - # in case episode not found at all + # in case episode not found at all if episode_index == -1: return formats = [] for feed in feeds: + url = feed.findall('./channel/item/guid')[episode_index].text formats.append({ - 'url': 
feed.findall('./channel/item/guid')[episode_index].text, - 'ext': 'mp4'}) + 'url': url, + 'ext': determine_ext(url, '')}) self._sort_formats(formats) From 1721d4a8467c7216e90398700270aaccbdf439a2 Mon Sep 17 00:00:00 2001 From: Kay B <> Date: Mon, 18 Sep 2017 01:15:36 +0200 Subject: [PATCH 4/5] [Heise] Improve RSS download for PR #14108 --- youtube_dl/extractor/heise.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 7124019dfb3..5aefc8f8f02 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -112,16 +112,19 @@ def ctUplinkHelper(self, title, video_id): # e.g. "18.5" from "c't uplink 18.5:" episode_str = re.findall(r'[0-9]{1,2}.[0-9]{1,2}', title) - feeds = [ - self._download_xml( + feeds = [] + for i, feed in enumerate([ 'https://www.heise.de/ct/uplink/ctuplink.rss', - video_id, "Downloading alternative XML (audio)"), - self._download_xml( 'https://www.heise.de/ct/uplink/ctuplinkvideo.rss', - video_id, "Downloading alternative XML (SD)"), - self._download_xml( - 'https://www.heise.de/ct/uplink/ctuplinkvideohd.rss', - video_id, "Downloading alternative XML (HD)")] + 'https://www.heise.de/ct/uplink/ctuplinkvideohd.rss']): + xml = self._download_xml(feed, video_id, + "Downloading alternative XML (%s)" % (['audio', 'SD video', 'HD video'][i]), + fatal=False) + if xml is not False: + feeds.append(xml) + + if len(feeds) == 0: + return titles = feeds[0].findall('./channel/item/title') descriptions = feeds[0].findall('./channel/item/description') From 237794107bc7125bf598aea01abd593c1bf9ff45 Mon Sep 17 00:00:00 2001 From: Kay B <> Date: Mon, 18 Sep 2017 02:00:29 +0200 Subject: [PATCH 5/5] [heise] Further improve RSS download --- youtube_dl/extractor/heise.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 5aefc8f8f02..2d9fd761eef 100644 
--- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -113,13 +113,13 @@ def ctUplinkHelper(self, title, video_id): episode_str = re.findall(r'[0-9]{1,2}.[0-9]{1,2}', title) feeds = [] - for i, feed in enumerate([ - 'https://www.heise.de/ct/uplink/ctuplink.rss', - 'https://www.heise.de/ct/uplink/ctuplinkvideo.rss', - 'https://www.heise.de/ct/uplink/ctuplinkvideohd.rss']): - xml = self._download_xml(feed, video_id, - "Downloading alternative XML (%s)" % (['audio', 'SD video', 'HD video'][i]), - fatal=False) + for path, format_id in ( + ('', 'audio'), + ('video', 'SD video'), + ('videohd', 'HD video')): + xml = self._download_xml( + 'https://www.heise.de/ct/uplink/ctuplink%s.rss' % path, + video_id, 'Downloading %s feed' % format_id, fatal=False) if xml is not False: feeds.append(xml)