Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[funimation] Extract more format info from site #9515

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@
- **freespeech.org**
- **FreeVideo**
- **Funimation**
- **funimation:playlist**
- **FunnyOrDie**
- **GameInformer**
- **Gamekings**
Expand Down
5 changes: 4 additions & 1 deletion youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,10 @@
from .freesound import FreesoundIE
from .freespeech import FreespeechIE
from .freevideo import FreeVideoIE
from .funimation import FunimationIE
from .funimation import (
FunimationIE,
FunimationShowPlaylistIE,
)
from .funnyordie import FunnyOrDieIE
from .gameinformer import GameInformerIE
from .gamekings import GamekingsIE
Expand Down
197 changes: 150 additions & 47 deletions youtube_dl/extractor/funimation.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlparse,
compat_parse_qs,
)
from ..utils import (
clean_html,
determine_ext,
int_or_none,
float_or_none,
sanitized_Request,
ExtractorError,
urlencode_postdata
urlencode_postdata,
NO_DEFAULT,
OnDemandPagedList,
)


class FunimationIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)'

class FunimationBaseIE(InfoExtractor):
_NETRC_MACHINE = 'funimation'

_TESTS = [{
'url': 'http://www.funimation.com/shows/air/videos/official/breeze',
'info_dict': {
'id': '658',
'display_id': 'breeze',
'ext': 'mp4',
'title': 'Air - 1 - Breeze',
'description': 'md5:1769f43cd5fc130ace8fd87232207892',
'thumbnail': 're:https?://.*\.jpg',
},
'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
}, {
'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
'info_dict': {
'id': '31128',
'display_id': 'role-play',
'ext': 'mp4',
'title': '.hack//SIGN - 1 - Role Play',
'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
'thumbnail': 're:https?://.*\.jpg',
},
'skip': 'Access without user interaction is forbidden by CloudFlare',
}, {
'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
'info_dict': {
'id': '9635',
'display_id': 'broadcast-dub-preview',
'ext': 'mp4',
'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': 're:https?://.*\.(?:jpg|png)',
},
'skip': 'Access without user interaction is forbidden by CloudFlare',
}]

_LOGIN_URL = 'http://www.funimation.com/login'

def _download_webpage(self, *args, **kwargs):
try:
return super(FunimationIE, self)._download_webpage(*args, **kwargs)
return super(FunimationBaseIE, self)._download_webpage(*args, **kwargs)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
response = ee.cause.read()
Expand Down Expand Up @@ -112,6 +81,33 @@ def _login(self):
# Initialization hook: its only action is to delegate to self._login(),
# which is defined earlier in this class (outside the visible diff hunk).
def _real_initialize(self):
self._login()


class FunimationIE(FunimationBaseIE):
_VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&"]+)'
_TESTS = [{
'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
'info_dict': {
'id': '31128',
'display_id': 'role-play',
'ext': 'mp4',
'title': '.hack//SIGN - 1 - Role Play',
'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
'thumbnail': 're:https?://.*\.jpg',
},
'skip': 'Access without user interaction is forbidden by CloudFlare',
}, {
'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
'info_dict': {
'id': '9635',
'display_id': 'broadcast-dub-preview',
'ext': 'mp4',
'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': 're:https?://.*\.(?:jpg|png)',
},
'skip': 'Access without user interaction is forbidden by CloudFlare',
}]

def _real_extract(self, url):
display_id = self._match_id(url)

Expand Down Expand Up @@ -144,6 +140,15 @@ def _real_extract(self, url):
if user_agent:
USER_AGENTS = ((None, user_agent),)

# Extract language preference from URL if present
query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
preference = query.get('watch', [None])[-1]

# Initialize variables with defaults
season_id = None
season_number = None
episode_number = None

for kind, user_agent in USER_AGENTS:
request = sanitized_Request(url)
request.add_header('User-Agent', user_agent)
Expand All @@ -157,8 +162,13 @@ def _real_extract(self, url):
webpage, 'players data'),
display_id)[0]['playlist']

items = next(item['items'] for item in playlist if item.get('items'))
item = next(item for item in items if item.get('itemAK') == display_id)
season = next(item for item in playlist if item.get('items'))
item = next(item for item in season['items'] if item.get('itemAK') == display_id)
if season.get('itemClass') == 'season':
season_id = season.get('itemAK')
season_number = int_or_none(self._search_regex(
r'^Season ([0-9]+)$', season_id, 'season number', None))
episode_number = float_or_none(item.get('number'))

error_messages = {}
video_error_messages = self._search_regex(
Expand All @@ -181,7 +191,6 @@ def _real_extract(self, url):
if not auth_token:
continue
funimation_id = video.get('FUNImationID') or video.get('videoId')
preference = 1 if video.get('languageMode') == 'dub' else 0
if not auth_token.startswith('?'):
auth_token = '?%s' % auth_token
for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)):
Expand All @@ -192,9 +201,18 @@ def _real_extract(self, url):
errors.append(format_url)
continue
if determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
m3u8_formats = self._extract_m3u8_formats(
format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native',
preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False))
m3u8_id='%s-hls' % funimation_id, fatal=False)
# Add language and preference
for m3u8_format in m3u8_formats:
m3u8_format['language'] = ('en-US'
if video.get('languageMode') == 'dub'
else 'ja-JP')
m3u8_format['language_preference'] = (10
if video.get('languageMode') == preference
else -1)
formats.append(m3u8_format)
else:
tbr = int_or_none(self._search_regex(
r'-(\d+)[Kk]', format_url, 'tbr', default=None))
Expand All @@ -203,7 +221,8 @@ def _real_extract(self, url):
'format_id': '%s-http-%dp' % (funimation_id, height),
'height': height,
'tbr': tbr,
'preference': preference,
'language': 'en-US' if video.get('languageMode') == 'dub' else 'ja-JP',
'language_preference': 10 if video.get('languageMode') == preference else -1
})

if not formats and errors:
Expand All @@ -216,9 +235,14 @@ def _real_extract(self, url):

title = item['title']
artist = item.get('artist')
episode = None
if artist:
title = '%s - %s' % (artist, title)
episode = self._search_regex(
r'^[0-9]+ - (.*)$', item['title'], 'episode name', NO_DEFAULT, False)
description = self._og_search_description(webpage) or item.get('description')
if description:
description = description.strip()
thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl')
video_id = item.get('itemId') or display_id

Expand All @@ -227,6 +251,85 @@ def _real_extract(self, url):
'display_id': display_id,
'title': title,
'description': description,
'series': artist,
'season_id': season_id,
'season_number': season_number,
'episode_id': item.get('videoUrl'),
'episode': episode,
'episode_number': episode_number,
'thumbnail': thumbnail,
'formats': formats,
}


class FunimationShowPlaylistIE(FunimationBaseIE):
    """Playlist extractor for a Funimation show landing page.

    Matches ``/shows/<slug>`` optionally followed by ``/home``, ``/about``
    or ``/videos``. The show page is downloaded to read the ``playersData``
    JavaScript variable (show id, title, description); the episode URLs are
    then fetched lazily, one page at a time, from the site's
    ``viewAllFiltered`` endpoint and handed to ``FunimationIE``.
    """

    IE_NAME = 'funimation:playlist'
    _VALID_URL = r'(?P<seriesurl>https?://(?:www\.)?funimation\.com/shows/(?P<id>[^/]+))(?:/(?:home|about|videos))?$'
    _TESTS = [{
        'url': 'http://www.funimation.com/shows/a-certain-scientific-railgun/home',
        'info_dict': {
            'id': 'a-certain-scientific-railgun',
            'description': 'Misaka’s electro-manipulation abilities – and delightfully destructive Railgun projectile move – make her a rock star in Academy City. The techno-metropolis is packed with supernaturally powered students known as espers, including Misaka’s flirty friend and roommate, Kuroko. She uses her teleportation skills as a member of the Judgment law enforcement team, fighting crime alongside her fellow agent Uiharu. Joined by their friend Saten, a spunky Level 0 esper, Misaka,',
            'title': 'A Certain Scientific Railgun',
        },
        'playlist_count': 48,
    }, {
        'url': 'http://www.funimation.com/shows/hacksign/home',
        'info_dict': {
            'id': 'hacksign',
            'description': 'Tsukasa wakes up inside The World, a massive online role-playing game full of magic and monsters, and finds himself unable to log out. With no knowledge of what’s happening in the real world, Tsukasa must discover how he ended up stuck in the game, and what connection he has with the fabled Key of the Twilight—an item that’s rumored to grant ultimate control over the digital realm.',
            'title': '.hack//SIGN',
        },
        'playlist_count': 56,
    }]

    # Offset step used for the episode-list endpoint and page size reported
    # to OnDemandPagedList (both were the literal 20 before).
    _PAGE_SIZE = 20

    @staticmethod
    def _build_request(url, user_agent):
        """Return a request for *url*, overriding User-Agent only if given.

        FunimationIE treats the session user agent as optional
        (``if user_agent:``), so it may be falsy here as well; setting a
        ``None`` header value would send a bogus header or fail outright.
        """
        request = sanitized_Request(url)
        if user_agent:
            request.add_header('User-Agent', user_agent)
        return request

    def _real_extract(self, url):
        """Build a lazily paged playlist result for the show at *url*."""
        display_id = self._match_id(url)

        user_agent = self._extract_cloudflare_session_ua(url)

        # Use series page to get ID number and title / description
        series_url = self._search_regex(
            self._VALID_URL, url, 'series URL', group='seriesurl')
        webpage = self._download_webpage(
            self._build_request(series_url, user_agent),
            display_id, 'Downloading series webpage')

        # Parseable show data stored as a JavaScript variable
        playlist = self._parse_json(
            self._search_regex(
                r'var\s+playersData\s*=\s*(\[.+?\]);\n',
                webpage, 'players data'),
            display_id)[0]['playlist'][0]

        show_id = playlist.get('showId')
        page_size = self._PAGE_SIZE
        episode_link_re = r'(?s)<a href="(' + FunimationIE._VALID_URL + r')"'

        def pagefunc(pagenum):
            offset = pagenum * page_size
            # Internal Funimation endpoint for getting paginated video list HTML
            request = self._build_request(
                'https://www.funimation.com/shows/viewAllFiltered?section=episodes&showid={0}&offset={1}'
                .format(show_id, offset),
                user_agent)
            episode_list = self._download_json(
                request, display_id,
                'Downloading episode list from {0}'.format(offset))['main']

            # There are multiple instances of each video URL, so keep only
            # the first occurrence of each while preserving episode order.
            urls_seen = set()
            entries = []
            for mobj in re.finditer(episode_link_re, episode_list):
                episode_url = mobj.group(1)
                if episode_url in urls_seen:
                    continue
                urls_seen.add(episode_url)
                entries.append(
                    self.url_result(episode_url, FunimationIE.ie_key()))
            return entries

        description = self._og_search_description(webpage) or playlist.get('description')
        if description:
            description = description.strip()

        return {
            '_type': 'playlist',
            'id': display_id,
            'title': playlist.get('artist'),
            'description': description,
            'entries': OnDemandPagedList(pagefunc, page_size, True),
        }