diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
index 382f32771db..2d9fd761eef 100644
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@@ -9,6 +9,8 @@
     xpath_text,
 )
 
+import re
+
 
 class HeiseIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html'
@@ -25,6 +27,14 @@ class HeiseIE(InfoExtractor):
             'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20',
             'thumbnail': r're:^https?://.*/gallery/$',
         }
+    }, {
+        'url': 'https://www.heise.de/ct/artikel/c-t-uplink-18-5-Android-Oreo-Nokia-Galaxy-Note-8-AMD-Ryzen-Threadripper-3812972.html',  # noqa
+        'info_dict': {
+            'id': '3812972',
+            'ext': 'mp4',
+            'title': "c't uplink 18.5: Android Oreo, Nokia, Galaxy Note 8, AMD Ryzen Threadripper",  # noqa
+            'description': 'md5:0601ade34ae5c4f5058d378327928348'
+        }
     }, {
         'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html',
         'only_matching': True,
@@ -53,11 +63,21 @@ def _real_extract(self, url):
                 r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"',
                 webpage, 'title')
 
-        doc = self._download_xml(
-            'http://www.heise.de/videout/feed', video_id, query={
-                'container': container_id,
-                'sequenz': sequenz_id,
-            })
+        # videout/feed still and exclusively works for older videos;
+        # for new c't uplink episodes, we need the work-around below.
+        try:
+            doc = self._download_xml(
+                'http://www.heise.de/videout/feed', video_id, query={
+                    'container': container_id,
+                    'sequenz': sequenz_id,
+                })
+        except Exception as e:
+            # Only a 404 for a c't title selects the RSS fallback; anything
+            # else is re-raised so `doc` is never used while unbound.
+            status = getattr(getattr(e, 'cause', None), 'code', None)
+            if status == 404 and "c't" in title:
+                return self.ctUplinkHelper(title, video_id)
+            raise
 
         formats = []
         for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'):
@@ -88,3 +108,61 @@ def _real_extract(self, url):
                 self._html_search_meta('date', webpage)),
             'formats': formats,
         }
+
+    def ctUplinkHelper(self, title, video_id):
+        """Resolve a c't uplink episode through the podcast RSS feeds.
+
+        Fallback for episodes the videout feed answers with 404. The
+        episode number embedded in ``title`` (e.g. "18.5" from
+        "c't uplink 18.5: ...") is looked up among the item titles of the
+        audio, SD video and HD video feeds; every feed that lists the
+        episode contributes one format.
+
+        Returns the info dict, or None if ``title`` holds no episode
+        number or no feed yields a media URL for that episode.
+        """
+        episode = re.search(r'[0-9]{1,2}\.[0-9]{1,2}', title)
+        if episode is None:
+            return None
+        # Digit/dot guards keep "8.5" from matching inside "18.5".
+        episode_re = re.compile(
+            r'(?<![0-9.])%s(?![0-9])' % re.escape(episode.group(0)))
+
+        description = None
+        formats = []
+        for path, format_id in (
+                ('', 'audio'),
+                ('video', 'SD video'),
+                ('videohd', 'HD video')):
+            feed = self._download_xml(
+                'https://www.heise.de/ct/uplink/ctuplink%s.rss' % path,
+                video_id, 'Downloading %s feed' % format_id, fatal=False)
+            if feed is False:
+                continue
+            # Look the episode up in each feed separately: an index taken
+            # from one feed is not guaranteed to hit the same episode in
+            # another one.
+            for item in feed.findall('./channel/item'):
+                if not episode_re.search(item.findtext('title') or ''):
+                    continue
+                media_url = item.findtext('guid')
+                if media_url:
+                    formats.append({
+                        'url': media_url,
+                        'ext': determine_ext(media_url, ''),
+                    })
+                if description is None:
+                    description = item.findtext('description')
+                break
+
+        if not formats:
+            return None
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+        }