From 5650b0d582d43ab5ee88a66267dc1899ff181205 Mon Sep 17 00:00:00 2001 From: Steven Gosseling Date: Fri, 26 Feb 2016 13:31:52 +0100 Subject: [PATCH 1/5] [closertotruth] Add new extractor Removed print statement from code. Replaced two regex searches with the correct ones. Removed some unnecessary semicolons, fixed title extraction, refactored everything to search_regex --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/closertotruth.py | 61 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/closertotruth.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1ae606f1eaa..8611cceb197 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py new file mode 100644 index 00000000000..11781b0b7be --- /dev/null +++ b/youtube_dl/extractor/closertotruth.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools +import hashlib + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?closertotruth\.com/series/\S+#video-(?P\w+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'md5': '2aa5b8971633d86fe32152827846a5b4', + 'info_dict': { + 'id': '3688', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?
- Dean W.Zimmerman ' + } + },{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-4048', + 'md5': 'a3882bb6e453720d8a7a3983f58abd04', + 'info_dict': { + 'id': '4048', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem? - John Searle ' + } + }] + + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + #compose title for video + video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') + + entry_id = self._search_regex(r'.+', webpage, "video entry_id") + interviewee_name = re.sub(r'(<[^>]+>)', '',self._search_regex(r'(.+)', webpage, "video interviewee_name")) + + video_title = video_title + ' - ' + interviewee_name + + #extract the partner id for kaltura.com + p_id = self._search_regex(r'', webpage, "kaltura partner_id") + + #request video url at kaltura API + #from: http://knowledge.kaltura.com/faq/how-retrieve-download-or-streaming-url-using-api-calls + api_request_url = 'http://www.kaltura.com/p/'+p_id+'/sp/0/playManifest/entryId/'+entry_id+'/protocol/HTTPS/flavorParamId/0/video.mp4' + api_response = self._download_webpage(api_request_url, video_id) + + video_url = self._search_regex(r' Date: Sat, 27 Feb 2016 18:06:38 +0100 Subject: [PATCH 2/5] processed comments on commit 5650b0d, fixed feedback from flake8 --- youtube_dl/extractor/closertotruth.py | 47 +++++++-------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index 11781b0b7be..1e5fa1be5c0 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -2,14 +2,9 @@ from __future__ import unicode_literals import re -import itertools -import hashlib from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) + class CloserToTruthIE(InfoExtractor): _VALID_URL = 
r'http?://(?:www\.)?closertotruth\.com/series/\S+#video-(?P\w+)' @@ -17,45 +12,27 @@ class CloserToTruthIE(InfoExtractor): 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', 'md5': '2aa5b8971633d86fe32152827846a5b4', 'info_dict': { - 'id': '3688', - 'ext': 'mov', - 'title': 'Solutions to the Mind-Body Problem? - Dean W.Zimmerman ' - } - },{ - 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-4048', - 'md5': 'a3882bb6e453720d8a7a3983f58abd04', - 'info_dict': { - 'id': '4048', + 'id': '0_zh2b6eqr', 'ext': 'mov', - 'title': 'Solutions to the Mind-Body Problem? - John Searle ' + 'title': 'ZimDe-010-S', + 'upload_date': '20140307', + 'timestamp': 1394236392, + 'uploader_id': 'CTTXML' } }] - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - #compose title for video + video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') - entry_id = self._search_regex(r'.+', webpage, "video entry_id") - interviewee_name = re.sub(r'(<[^>]+>)', '',self._search_regex(r'(.+)', webpage, "video interviewee_name")) + entry_id = self._search_regex(r'.+', webpage, "video entry_id") + interviewee_name = re.sub(r'(<[^>]+>)', '', self._search_regex(r'(.+)', webpage, "video interviewee_name")) video_title = video_title + ' - ' + interviewee_name - #extract the partner id for kaltura.com + # extract the partner id for kaltura.com p_id = self._search_regex(r'', webpage, "kaltura partner_id") - - #request video url at kaltura API - #from: http://knowledge.kaltura.com/faq/how-retrieve-download-or-streaming-url-using-api-calls - api_request_url = 'http://www.kaltura.com/p/'+p_id+'/sp/0/playManifest/entryId/'+entry_id+'/protocol/HTTPS/flavorParamId/0/video.mp4' - api_response = self._download_webpage(api_request_url, video_id) - - video_url = self._search_regex(r' Date: Sat, 27 Feb 2016 22:28:39 +0100 Subject: [PATCH 3/5] Improved regexes and returns info dict now. 
--- youtube_dl/extractor/closertotruth.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index 1e5fa1be5c0..66b41a9d35b 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -7,7 +7,7 @@ class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?closertotruth\.com/series/\S+#video-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/series/[^#]+#video-(?P\w+)' _TESTS = [{ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', 'md5': '2aa5b8971633d86fe32152827846a5b4', @@ -27,12 +27,17 @@ def _real_extract(self, url): video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') - entry_id = self._search_regex(r'.+', webpage, "video entry_id") + entry_id = self._search_regex(r']+id="video-%s"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") interviewee_name = re.sub(r'(<[^>]+>)', '', self._search_regex(r'(.+)', webpage, "video interviewee_name")) video_title = video_title + ' - ' + interviewee_name - # extract the partner id for kaltura.com - p_id = self._search_regex(r'', webpage, "kaltura partner_id") + p_id = self._search_regex(r']+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id") - return self.url_result('kaltura:%s:%s' % (p_id, entry_id), 'Kaltura', entry_id, video_title) + return { + '_type': 'url_transparent', + 'id': entry_id, + 'url': 'kaltura:%s:%s' % (p_id, entry_id), + 'ie_key': 'Kaltura', + 'title': video_title + } \ No newline at end of file From 13d78925603df273572cdb4525e406f070163c4c Mon Sep 17 00:00:00 2001 From: Steven Gosseling Date: Tue, 8 Mar 2016 15:56:49 +0100 Subject: [PATCH 4/5] Added support for closertotruth interview URL --- youtube_dl/extractor/closertotruth.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py 
b/youtube_dl/extractor/closertotruth.py index 66b41a9d35b..3b9a0292102 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -1,13 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/series/[^#]+#video-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(series|interviews)/(?:[^#]+#video-)?(?P\d+)' _TESTS = [{ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', 'md5': '2aa5b8971633d86fe32152827846a5b4', @@ -18,6 +16,16 @@ class CloserToTruthIE(InfoExtractor): 'upload_date': '20140307', 'timestamp': 1394236392, 'uploader_id': 'CTTXML' + }, + 'url': 'http://closertotruth.com/interviews/1725', + 'md5': 'b00598fd6a38372edb976408f72c5792', + 'info_dict': { + 'id': '0_19qv5rn1', + 'ext': 'mov', + 'title': 'AyaFr-002 - Francisco J. Ayala', + 'upload_date': '20140307', + 'timestamp': 1394236431, + 'uploader_id': 'CTTXML' } }] @@ -27,8 +35,9 @@ def _real_extract(self, url): video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') - entry_id = self._search_regex(r']+id="video-%s"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") - interviewee_name = re.sub(r'(<[^>]+>)', '', self._search_regex(r'(.+)', webpage, "video interviewee_name")) + entry_id = self._search_regex(r']+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") + + interviewee_name = self._search_regex(r'
(.*)<\/h3>', webpage, "video interviewee_name") video_title = video_title + ' - ' + interviewee_name @@ -40,4 +49,4 @@ def _real_extract(self, url): 'url': 'kaltura:%s:%s' % (p_id, entry_id), 'ie_key': 'Kaltura', 'title': video_title - } \ No newline at end of file + } From 7aabc672ca80719e53cbb0890d72e7388d02981f Mon Sep 17 00:00:00 2001 From: Steven Gosseling Date: Fri, 18 Mar 2016 16:42:28 +0100 Subject: [PATCH 5/5] Added support for episodes page --- youtube_dl/extractor/closertotruth.py | 65 +++++++++++++++++---------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index 3b9a0292102..d04ff5e4f88 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -5,29 +5,45 @@ class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(series|interviews)/(?:[^#]+#video-)?(?P\d+)' - _TESTS = [{ - 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', - 'md5': '2aa5b8971633d86fe32152827846a5b4', - 'info_dict': { - 'id': '0_zh2b6eqr', - 'ext': 'mov', - 'title': 'ZimDe-010-S', - 'upload_date': '20140307', - 'timestamp': 1394236392, - 'uploader_id': 'CTTXML' + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P\d+))' + _TESTS = [ + { + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'md5': '5c548bde260a9247ddfdc07c7458ed29', + 'info_dict': { + 'id': '0_zof1ktre', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + } }, - 'url': 'http://closertotruth.com/interviews/1725', - 'md5': 'b00598fd6a38372edb976408f72c5792', - 'info_dict': { - 'id': '0_19qv5rn1', - 'ext': 'mov', - 'title': 'AyaFr-002 - Francisco J. 
Ayala', - 'upload_date': '20140307', - 'timestamp': 1394236431, - 'uploader_id': 'CTTXML' - } - }] + { + 'url': 'http://closertotruth.com/interviews/1725', + 'md5': 'b00598fd6a38372edb976408f72c5792', + 'info_dict': { + 'id': '0_19qv5rn1', + 'ext': 'mov', + 'title': 'AyaFr-002 - Francisco J. Ayala', + 'upload_date': '20140307', + 'timestamp': 1394236431, + 'uploader_id': 'CTTXML' + } + }, + { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'md5': '4dd96aa0a5c296afa5c0bd24895c2f16', + 'info_dict': { + 'id': '0_iuxai6g6', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + } + }, + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -37,9 +53,10 @@ def _real_extract(self, url): entry_id = self._search_regex(r']+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") - interviewee_name = self._search_regex(r'
(.*)<\/h3>', webpage, "video interviewee_name") + interviewee_name = self._search_regex(r'
(.*).+', webpage, "video interviewee_name", False) - video_title = video_title + ' - ' + interviewee_name + if interviewee_name: + video_title = video_title + ' - ' + interviewee_name p_id = self._search_regex(r']+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id")