Skip to content

Commit

Permalink
[closertotruth] Add new extractor
Browse files Browse the repository at this point in the history
Removed print statement from code.

Replaced two regex searches with the correct ones.

Removed some unnecessary semicolons

fixed title extraction

refactored everything to search_regex
  • Loading branch information
stevengos committed Feb 27, 2016
1 parent bf4b3b6 commit 5650b0d
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
1 change: 1 addition & 0 deletions youtube_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .closertotruth import CloserToTruthIE
from .cloudy import CloudyIE
from .clubic import ClubicIE
from .clyp import ClypIE
Expand Down
61 changes: 61 additions & 0 deletions youtube_dl/extractor/closertotruth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import itertools
import hashlib

from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
)

class CloserToTruthIE(InfoExtractor):
    """Extractor for single videos on closertotruth.com series pages.

    A video is addressed by the ``#video-<id>`` fragment of a series URL.
    The series page supplies the series title, the interviewee name and the
    Kaltura entry/partner ids; the media URL is then read from the Kaltura
    playManifest response.
    """

    # `https?` accepts both schemes. The previous `http?` only made the
    # trailing `p` optional, so https:// URLs were rejected.
    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/series/\S+#video-(?P<id>\w+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'md5': '2aa5b8971633d86fe32152827846a5b4',
        'info_dict': {
            'id': '3688',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem? - Dean W.Zimmerman',
        },
    }, {
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-4048',
        'md5': 'a3882bb6e453720d8a7a3983f58abd04',
        'info_dict': {
            'id': '4048',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem? - John Searle',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict (``url``, ``id``, ``title``) for *url*.

        Downloads the series page, locates the ``<a>`` element whose id is
        ``video-<id>`` to obtain the Kaltura entry id and the interviewee
        name, then queries the Kaltura playManifest endpoint for the media
        URL. Each lookup goes through ``_search_regex``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Series title: text of <title> up to the last " |" separator.
        series_title = self._search_regex(
            r'<title>(.+) \|.+</title>', webpage, 'video title')

        # Both lookups below target the same anchor element; build its
        # common prefix once and escape the id before embedding it in a
        # regular expression.
        anchor_prefix = (
            r'<a href="\S+" id="video-%s" data-kaltura="'
            % re.escape(video_id))

        entry_id = self._search_regex(
            anchor_prefix + r'(\w+)">.+<span.+<\/a>',
            webpage, 'video entry_id')

        interviewee_html = self._search_regex(
            anchor_prefix + r'\w+">(.+)<span.+<\/a>',
            webpage, 'video interviewee_name')
        # Drop any markup inside the anchor, then trim the surrounding
        # whitespace so the composed title has no stray trailing space.
        interviewee_name = re.sub(r'(<[^>]+>)', '', interviewee_html).strip()

        video_title = '%s - %s' % (series_title, interviewee_name)

        # Partner id for kaltura.com, taken from the embedded script URL.
        partner_id = self._search_regex(
            r'<script src="http://cdnapi\.kaltura\.com/p/(?P<p>\w+)/sp/\w+/\S+/partner_id/\w+"></script>',
            webpage, 'kaltura partner_id')

        # Request the video URL from the Kaltura API, see
        # http://knowledge.kaltura.com/faq/how-retrieve-download-or-streaming-url-using-api-calls
        manifest_url = (
            'http://www.kaltura.com/p/%s/sp/0/playManifest/entryId/%s'
            '/protocol/HTTPS/flavorParamId/0/video.mp4'
            % (partner_id, entry_id))
        manifest = self._download_webpage(manifest_url, video_id)

        video_url = self._search_regex(
            r'<media url="(\S+)"', manifest, 'video url')

        return {
            'url': video_url,
            'id': video_id,
            'title': video_title,
        }

0 comments on commit 5650b0d

Please sign in to comment.