ytdl-org · Parent5446 · May 16, 2016 · Jun 1, 2016
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -228,6 +228,7 @@
  - **freespeech.org**
  - **FreeVideo**
  - **Funimation**
+ - **funimation:playlist**
  - **FunnyOrDie**
  - **GameInformer**
  - **Gamekings**

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
@@ -258,7 +258,10 @@
 from .freesound import FreesoundIE
 from .freespeech import FreespeechIE
 from .freevideo import FreeVideoIE
-from .funimation import FunimationIE
+from .funimation import (
+    FunimationIE,
+    FunimationShowPlaylistIE,
+)
 from .funnyordie import FunnyOrDieIE
 from .gameinformer import GameInformerIE
 from .gamekings import GamekingsIE

diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
@@ -1,66 +1,35 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
     compat_urllib_parse_unquote_plus,
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
 )
 from ..utils import (
     clean_html,
     determine_ext,
     int_or_none,
+    float_or_none,
     sanitized_Request,
     ExtractorError,
-    urlencode_postdata
+    urlencode_postdata,
+    NO_DEFAULT,
+    OnDemandPagedList,
 )
 
 
-class FunimationIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)'
-
+class FunimationBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'funimation'
-
-    _TESTS = [{
-        'url': 'http://www.funimation.com/shows/air/videos/official/breeze',
-        'info_dict': {
-            'id': '658',
-            'display_id': 'breeze',
-            'ext': 'mp4',
-            'title': 'Air - 1 - Breeze',
-            'description': 'md5:1769f43cd5fc130ace8fd87232207892',
-            'thumbnail': 're:https?://.*\.jpg',
-        },
-        'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
-    }, {
-        'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
-        'info_dict': {
-            'id': '31128',
-            'display_id': 'role-play',
-            'ext': 'mp4',
-            'title': '.hack//SIGN - 1 - Role Play',
-            'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
-            'thumbnail': 're:https?://.*\.jpg',
-        },
-        'skip': 'Access without user interaction is forbidden by CloudFlare',
-    }, {
-        'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
-        'info_dict': {
-            'id': '9635',
-            'display_id': 'broadcast-dub-preview',
-            'ext': 'mp4',
-            'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
-            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
-            'thumbnail': 're:https?://.*\.(?:jpg|png)',
-        },
-        'skip': 'Access without user interaction is forbidden by CloudFlare',
-    }]
-
     _LOGIN_URL = 'http://www.funimation.com/login'
 
     def _download_webpage(self, *args, **kwargs):
         try:
-            return super(FunimationIE, self)._download_webpage(*args, **kwargs)
+            return super(FunimationBaseIE, self)._download_webpage(*args, **kwargs)
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
                 response = ee.cause.read()
@@ -112,6 +81,33 @@ def _login(self):
     def _real_initialize(self):
         self._login()
 
+
+class FunimationIE(FunimationBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&"]+)'
+    _TESTS = [{
+        'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
+        'info_dict': {
+            'id': '31128',
+            'display_id': 'role-play',
+            'ext': 'mp4',
+            'title': '.hack//SIGN - 1 - Role Play',
+            'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
+            'thumbnail': 're:https?://.*\.jpg',
+        },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
+    }, {
+        'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
+        'info_dict': {
+            'id': '9635',
+            'display_id': 'broadcast-dub-preview',
+            'ext': 'mp4',
+            'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
+            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
+            'thumbnail': 're:https?://.*\.(?:jpg|png)',
+        },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
+    }]
+
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
@@ -144,6 +140,15 @@ def _real_extract(self, url):
         if user_agent:
             USER_AGENTS = ((None, user_agent),)
 
+        # Extract language preference from URL if present
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        preference = query.get('watch', [None])[-1]
+
+        # Initialize variables with defaults
+        season_id = None
+        season_number = None
+        episode_number = None
+
         for kind, user_agent in USER_AGENTS:
             request = sanitized_Request(url)
             request.add_header('User-Agent', user_agent)
@@ -157,8 +162,13 @@ def _real_extract(self, url):
                     webpage, 'players data'),
                 display_id)[0]['playlist']
 
-            items = next(item['items'] for item in playlist if item.get('items'))
-            item = next(item for item in items if item.get('itemAK') == display_id)
+            season = next(item for item in playlist if item.get('items'))
+            item = next(item for item in season['items'] if item.get('itemAK') == display_id)
+            if season.get('itemClass') == 'season':
+                season_id = season.get('itemAK')
+                season_number = int_or_none(self._search_regex(
+                    r'^Season ([0-9]+)$', season_id, 'season number', None))
+                episode_number = float_or_none(item.get('number'))
 
             error_messages = {}
             video_error_messages = self._search_regex(
@@ -181,7 +191,6 @@ def _real_extract(self, url):
                 if not auth_token:
                     continue
                 funimation_id = video.get('FUNImationID') or video.get('videoId')
-                preference = 1 if video.get('languageMode') == 'dub' else 0
                 if not auth_token.startswith('?'):
                     auth_token = '?%s' % auth_token
                 for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)):
@@ -192,9 +201,18 @@ def _real_extract(self, url):
                         errors.append(format_url)
                         continue
                     if determine_ext(format_url) == 'm3u8':
-                        formats.extend(self._extract_m3u8_formats(
+                        m3u8_formats = self._extract_m3u8_formats(
                             format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native',
-                            preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False))
+                            m3u8_id='%s-hls' % funimation_id, fatal=False)
+                        # Add language and preference
+                        for m3u8_format in m3u8_formats:
+                            m3u8_format['language'] = ('en-US'
+                                                       if video.get('languageMode') == 'dub'
+                                                       else 'ja-JP')
+                            m3u8_format['language_preference'] = (10
+                                                                  if video.get('languageMode') == preference
+                                                                  else -1)
+                            formats.append(m3u8_format)
                     else:
                         tbr = int_or_none(self._search_regex(
                             r'-(\d+)[Kk]', format_url, 'tbr', default=None))
@@ -203,7 +221,8 @@ def _real_extract(self, url):
                             'format_id': '%s-http-%dp' % (funimation_id, height),
                             'height': height,
                             'tbr': tbr,
-                            'preference': preference,
+                            'language': 'en-US' if video.get('languageMode') == 'dub' else 'ja-JP',
+                            'language_preference': 10 if video.get('languageMode') == preference else -1
                         })
 
         if not formats and errors:
@@ -216,9 +235,14 @@ def _real_extract(self, url):
 
         title = item['title']
         artist = item.get('artist')
+        episode = None
         if artist:
             title = '%s - %s' % (artist, title)
+            episode = self._search_regex(
+                r'^[0-9]+ - (.*)$', item['title'], 'episode name', NO_DEFAULT, False)
         description = self._og_search_description(webpage) or item.get('description')
+        if description:
+            description = description.strip()
         thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl')
         video_id = item.get('itemId') or display_id
 
@@ -227,6 +251,85 @@ def _real_extract(self, url):
             'display_id': display_id,
             'title': title,
             'description': description,
+            'series': artist,
+            'season_id': season_id,
+            'season_number': season_number,
+            'episode_id': item.get('videoUrl'),
+            'episode': episode,
+            'episode_number': episode_number,
             'thumbnail': thumbnail,
             'formats': formats,
         }
+
+
+class FunimationShowPlaylistIE(FunimationBaseIE):
+    """Playlist extractor for a whole Funimation show.
+
+    Matches a show's base URL (optionally suffixed with ``/home``,
+    ``/about`` or ``/videos``), reads the show metadata embedded in the
+    series page and returns a playlist whose entries are fetched lazily,
+    one page at a time, and delegated to ``FunimationIE``.
+    """
+
+    IE_NAME = 'funimation:playlist'
+    _VALID_URL = r'(?P<seriesurl>https?://(?:www\.)?funimation\.com/shows/(?P<id>[^/]+))(?:/(?:home|about|videos))?$'
+    # Number of episodes requested per call to the paginated listing endpoint;
+    # used both for the request offset and as the paged-list page size.
+    _PAGE_SIZE = 20
+    _TESTS = [{
+        'url': 'http://www.funimation.com/shows/a-certain-scientific-railgun/home',
+        'info_dict': {
+            'id': 'a-certain-scientific-railgun',
+            'description': 'Misaka’s electro-manipulation abilities – and delightfully destructive Railgun projectile move – make her a rock star in Academy City. The techno-metropolis is packed with supernaturally powered students known as espers, including Misaka’s flirty friend and roommate, Kuroko. She uses her teleportation skills as a member of the Judgment law enforcement team, fighting crime alongside her fellow agent Uiharu. Joined by their friend Saten, a spunky Level 0 esper, Misaka,',
+            'title': 'A Certain Scientific Railgun'
+        },
+        'playlist_count': 48
+    }, {
+        'url': 'http://www.funimation.com/shows/hacksign/home',
+        'info_dict': {
+            'id': 'hacksign',
+            'description': 'Tsukasa wakes up inside The World, a massive online role-playing game full of magic and monsters, and finds himself unable to log out. With no knowledge of what’s happening in the real world, Tsukasa must discover how he ended up stuck in the game, and what connection he has with the fabled Key of the Twilight—an item that’s rumored to grant ultimate control over the digital realm.',
+            'title': '.hack//SIGN'
+        },
+        'playlist_count': 56
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist info dict for the show addressed by ``url``.
+
+        The result carries ``id`` (the show slug from the URL), ``title``,
+        ``description`` and a lazily evaluated ``entries`` list of
+        ``FunimationIE`` URL results.
+        """
+        display_id = self._match_id(url)
+
+        # FunimationIE guards this same value with ``if user_agent:``, so it
+        # may be falsy here as well.
+        user_agent = self._extract_cloudflare_session_ua(url)
+
+        def build_request(target_url):
+            # Only override the User-Agent when a session value is available;
+            # passing None as a header value would send a bogus header or
+            # fail inside the HTTP layer.
+            request = sanitized_Request(target_url)
+            if user_agent:
+                request.add_header('User-Agent', user_agent)
+            return request
+
+        # Use series page to get ID number and title / description
+        series_url = self._search_regex(self._VALID_URL, url, 'series URL', group='seriesurl')
+        webpage = self._download_webpage(
+            build_request(series_url), display_id, 'Downloading series webpage')
+
+        # Parseable show data stored as a JavaScript variable
+        playlist = self._parse_json(
+            self._search_regex(
+                r'var\s+playersData\s*=\s*(\[.+?\]);\n',
+                webpage, 'players data'),
+            display_id)[0]['playlist'][0]
+
+        # Anchor tags pointing at single-episode pages; built once because it
+        # does not change between pages.
+        episode_link_re = r'(?s)<a href="(' + FunimationIE._VALID_URL + r')"'
+
+        def pagefunc(pagenum):
+            # Internal Funimation endpoint for getting paginated video list HTML
+            offset = pagenum * self._PAGE_SIZE
+            request = build_request(
+                'https://www.funimation.com/shows/viewAllFiltered?section=episodes&showid={0}&offset={1}'
+                .format(playlist.get('showId'), offset))
+            episode_list = self._download_json(
+                request, display_id, 'Downloading episode list from {0}'.format(offset))['main']
+
+            # There are multiple instances of each video URL, so filter for unique URLs
+            # while keeping the order of the episodes
+            urls_seen = set()
+            entries = []
+            for mobj in re.finditer(episode_link_re, episode_list):
+                episode_url = mobj.group(1)
+                if episode_url in urls_seen:
+                    continue
+                urls_seen.add(episode_url)
+                entries.append(self.url_result(episode_url, FunimationIE.ie_key()))
+            return entries
+
+        description = self._og_search_description(webpage) or playlist.get('description')
+        if description:
+            description = description.strip()
+
+        return {
+            '_type': 'playlist',
+            'id': display_id,
+            'title': playlist.get('artist'),
+            'description': description,
+            'entries': OnDemandPagedList(pagefunc, self._PAGE_SIZE, True)
+        }