Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[spankwire] Fixed and improved extractor #20648

Closed
wants to merge 3 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 32 additions & 47 deletions youtube_dl/extractor/spankwire.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,11 @@
import re

from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..utils import (
int_or_none,
sanitized_Request,
str_to_int,
unified_strdate,
)
from ..aes import aes_decrypt_text


class SpankwireIE(InfoExtractor):
Expand Down Expand Up @@ -54,61 +49,48 @@ def _real_extract(self, url):
req = sanitized_Request('http://www.' + mobj.group('url'))
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
JChris246 marked this conversation as resolved.
Show resolved Hide resolved
video_data = self._download_json(
sanitized_Request('https://www.spankwire.com/api/video/' + video_id + '.json'), video_id)
JChris246 marked this conversation as resolved.
Show resolved Hide resolved

title = self._html_search_regex(
title = video_data.get('title') or self._html_search_regex(
r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
webpage, 'thumbnail', fatal=False)
thumbnail = video_data.get('poster')
description = video_data.get('description')

uploader = self._html_search_regex(
r'by:\s*<a [^>]*>(.+?)</a>',
webpage, 'uploader', fatal=False)
uploader = self._search_regex(
r'<a[^>]+class="uploaded__by"[^>]*>(.+?)</a>',
webpage, 'uploader', flags=re.DOTALL, fatal=False)
JChris246 marked this conversation as resolved.
Show resolved Hide resolved
uploader_id = self._html_search_regex(
r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"',
r'by\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"',
webpage, 'uploader id', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
r'</a> on (.+?) at \d+:\d+',
r'</span>(.+?) at \d+:\d+ (?:AM|PM) by',
webpage, 'upload date', fatal=False))

view_count = str_to_int(self._html_search_regex(
r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
webpage, 'comment count', fatal=False))
view_count = int_or_none(video_data.get('viewed'))
comment_count = int_or_none(video_data.get('comments'))
duration = int_or_none(video_data.get('duration'))

def extract_list(arr):
    """Collect the ``name`` value of each entry in *arr*.

    *arr* is expected to be an iterable of dict-like entries (the call
    sites pass ``video_data.get('categories')`` and
    ``video_data.get('tags')``).  Because ``.get()`` yields ``None`` when
    the key is absent, a falsy *arr* is treated as an empty list instead
    of raising ``TypeError``.

    Returns a list of the non-empty names, in their original order.
    """
    names = []
    for entry in arr or ():
        name = entry.get('name')
        # Skip entries without a usable name so the resulting
        # categories/tags list never contains None placeholders.
        if name:
            names.append(name)
    return names

videos = re.findall(
r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)
heights = [int(video[0]) for video in videos]
video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos]))
if webpage.find(r'flashvars\.encrypted = "true"') != -1:
password = self._search_regex(
r'flashvars\.video_title = "([^"]+)',
webpage, 'password').replace('+', ' ')
video_urls = list(map(
lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'),
video_urls))
categories = extract_list(video_data.get('categories'))
tags = extract_list(video_data.get('tags'))

videos = video_data.get('videos').items()
videos.sort()
JChris246 marked this conversation as resolved.
Show resolved Hide resolved
formats = []
for height, video_url in zip(heights, video_urls):
path = compat_urllib_parse_urlparse(video_url).path
m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path)
if m:
tbr = int(m.group('tbr'))
height = int(m.group('height'))
else:
tbr = None
for quality, video_url in videos:
height = quality.replace('quality_', '').replace('p', '')
formats.append({
'url': video_url,
'format_id': '%dp' % height,
'height': height,
'tbr': tbr,
'format_id': quality,
'height': int_or_none(height),
'tbr': None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pointless: explicitly setting `'tbr': None` is the same as omitting the key.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The formats in the webpage are still available and should be extracted as well.

})
self._sort_formats(formats)

age_limit = self._rta_search(webpage)

Expand All @@ -124,4 +106,7 @@ def _real_extract(self, url):
'comment_count': comment_count,
'formats': formats,
'age_limit': age_limit,
'duration': duration,
'categories': categories,
'tags': tags,
}