[ie/bluey] Add extractor

kclauhk · Jul 21, 2024 · 7e39550 · 7e39550
1 parent 43412f6
commit 7e39550
Show file tree

Hide file tree

Showing 2 changed files with 276 additions and 0 deletions.
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
@@ -277,6 +277,7 @@
 from .blerp import BlerpIE
 from .blogger import BloggerIE
 from .bloomberg import BloombergIE
+from .bluey import BlueyIE
 from .bokecc import BokeCCIE
 from .bongacams import BongaCamsIE
 from .boosty import BoostyIE

diff --git a/yt_dlp/extractor/bluey.py b/yt_dlp/extractor/bluey.py
@@ -0,0 +1,275 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    merge_dicts,
+    str_or_none,
+    traverse_obj,
+    url_or_none,
+)
+
+
+class BlueyIE(InfoExtractor):
+    # Extracts the video player(s) embedded in a bluey.tv page.
+    # Players are either YouTube embeds (delegated to the YouTube extractor)
+    # or Brightcove videos (resolved here through the Brightcove Playback API).
+    _VALID_URL = r'https?://www\.bluey\.tv/(?:.+/)?(?P<id>[^/]+)/?$'
+    # Brightcove account and policy key used for the Playback API requests
+    _BRIGHTCOVE_ACCOUNT_ID = '6041795457001'
+    _BRIGHTCOVE_POLICY_KEY = 'BCpkADawqM0-e9kbtiYMtk9IxVZUWQ1X3DfbKGkMTtgzX-8zRbWKYj31aVgMTPXxCK3Uy_J4wYE8mXuYHlLUhu47Tsco9l6H_-3_BJKL10ip7fnY8tUiCotYIoaMcOTeqCwM9Vn2trMyy3HM'
+    _TESTS = [{
+        # Episode (YouTube embedded: https://youtu.be/u6D2ucvSas0)
+        'url': 'https://www.bluey.tv/watch/season-1/mums-and-dads/',
+        'info_dict': {
+            'id': 'u6D2ucvSas0',
+            'ext': 'mp4',
+            'title': 'Mums and Dads',
+            'description': 'md5:e215cd5c6d6ec050a354d2b06ad6fc9d',
+            'thumbnail': 'https://www.bluey.tv/wp-content/uploads/2023/08/ABTI325R50_MUMS_AND_DADS_Image_00.jpg',
+            'timestamp': 1591362032,
+            'upload_date': '20200605',
+            'uploader': 'Official Bluey TV',
+            'uploader_id': '@BlueyOfficialChannel',
+            'uploader_url': 'https://www.youtube.com/@BlueyOfficialChannel',
+            'channel': 'Bluey - Official Channel',
+            'channel_id': 'UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_url': 'https://www.youtube.com/channel/UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_follower_count': int,
+            'channel_is_verified': True,
+            'duration': 118,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Film & Animation'],
+            'tags': 'count:18',
+            'heatmap': 'count:100',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'season': 'Season 1',
+            'season_number': 1,
+            'episode': 'Episode 33',
+            'episode_number': 33,
+        },
+    }, {
+        # Episode with trailer video
+        'url': 'https://www.bluey.tv/watch/season-3/the-sign/',
+        'info_dict': {
+            'id': 'the-sign',
+            'title': 'The Sign',
+            'description': 'md5:6e9b01b32f35bdcf33160c86a15080f7',
+            'thumbnail': 'https://www.bluey.tv/wp-content/uploads/2024/02/Sign-Sq.png',
+            'uploader': 'Official Bluey TV',
+            'season': 'Season 3',
+            'season_number': 3,
+            'episode': 'Episode 49',
+            'episode_number': 49,
+        },
+        'playlist_count': 2,
+    }, {
+        # Minisode (Brightcove)
+        'url': 'https://www.bluey.tv/watch/minisodes/animals/',
+        'info_dict': {
+            'id': 'animals',
+            'ext': 'mp4',
+            'title': 'Animals',
+            'description': 'Mum is playing the animal game on Bingo\'s back.',
+            'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/jit/6041795457001/b8000e79-49d6-4732-88be-09fb0d484a98/main/1280x720/11s413ms/match/image.jpg',
+            'upload_date': '20240701',
+            'uploader': 'Official Bluey TV',
+            'tags': [],
+            'episode': 'Episode 7',
+            'episode_number': 7,
+            # Brightcove reports 22827 ms, i.e. 22 whole seconds
+            'duration': 22,
+        },
+    }, {
+        # Book-read (YouTube embedded: https://youtu.be/NbLxoLyPGyc)
+        'url': 'https://www.bluey.tv/watch/bluey-book-reads/charades-2/',
+        'info_dict': {
+            'id': 'NbLxoLyPGyc',
+            'ext': 'mp4',
+            'title': 'Charades',
+            'description': 'Jenna Fischer reads \'Charades\'',
+            'thumbnail': 'https://www.bluey.tv/wp-content/uploads/2024/02/AVSA067W_BlueyBookReads_S01_E06_Charades_TitlePromo_16x9.png',
+            'timestamp': 1713538806,
+            'release_date': '20240419',
+            'release_timestamp': 1713538806,
+            'upload_date': '20240419',
+            'uploader': 'Official Bluey TV',
+            'uploader_id': '@BlueyOfficialChannel',
+            'uploader_url': 'https://www.youtube.com/@BlueyOfficialChannel',
+            'channel': 'Bluey - Official Channel',
+            'channel_id': 'UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_url': 'https://www.youtube.com/channel/UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_follower_count': int,
+            'channel_is_verified': True,
+            'duration': 280,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Film & Animation'],
+            'heatmap': 'count:100',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'tags': 'count:28',
+        },
+    }, {
+        # Bonus-bit (YouTube embedded: https://youtu.be/UUkb_b5UEE0)
+        'url': 'https://www.bluey.tv/watch/bonus-bits/tea-party/',
+        'info_dict': {
+            'id': 'UUkb_b5UEE0',
+            'ext': 'mp4',
+            'title': 'Tea Party',
+            'description': 'Bluey and Honey invite Honey\'s mum and dad to a tea party.',
+            'thumbnail': 'https://www.bluey.tv/wp-content/uploads/2021/03/Bluey_Tea_Party_001.jpg',
+            'timestamp': 1614960018,
+            'upload_date': '20210305',
+            'uploader': 'Official Bluey TV',
+            'uploader_id': '@BlueyOfficialChannel',
+            'uploader_url': 'https://www.youtube.com/@BlueyOfficialChannel',
+            'channel': 'Bluey - Official Channel',
+            'channel_id': 'UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_url': 'https://www.youtube.com/channel/UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_follower_count': int,
+            'channel_is_verified': True,
+            'duration': 95,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Film & Animation'],
+            'heatmap': 'count:100',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'tags': 'count:24',
+        },
+    }, {
+        # Characters (YouTube embedded: https://youtu.be/HlOIzz-GIxk)
+        'url': 'https://www.bluey.tv/characters/bluey/',
+        'info_dict': {
+            'id': 'HlOIzz-GIxk',
+            'ext': 'mp4',
+            'title': 'BLUEY\'S HIGHLIGHTS',
+            'description': 'Bluey is a blue heeler pup who loves to make up and play fun and imaginative games with her family and friends.',
+            'thumbnail': 'https://www.bluey.tv/wp-content/uploads/2023/07/ABTI291B50_THE_BEACH_Image_09-scaled.jpg',
+            'timestamp': 1665759612,
+            'upload_date': '20221014',
+            'uploader': 'Official Bluey TV',
+            'uploader_id': '@BlueyOfficialChannel',
+            'uploader_url': 'https://www.youtube.com/@BlueyOfficialChannel',
+            'channel': 'Bluey - Official Channel',
+            'channel_id': 'UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_url': 'https://www.youtube.com/channel/UCVzLLZkDuFGAE2BGdBuBNBg',
+            'channel_follower_count': int,
+            'channel_is_verified': True,
+            'duration': 604,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Film & Animation'],
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'tags': 'count:24',
+        },
+    }]
+
+    def _extract_brightcove(self, brightcove_id, video_id):
+        """Resolve a Brightcove video through the Playback API.
+
+        Returns a dict with formats, subtitles and whatever metadata the API
+        response provides, or an empty dict when the API request fails.
+        The dict deliberately carries no 'id'/'title'; the caller supplies them.
+        """
+        data = self._download_json(
+            f'https://edge.api.brightcove.com/playback/v1/accounts/{self._BRIGHTCOVE_ACCOUNT_ID}/videos/{brightcove_id}',
+            video_id, fatal=False,
+            headers={'Accept': f'application/json;pk={self._BRIGHTCOVE_POLICY_KEY}'})
+        if not data:
+            return {}
+
+        formats, subtitles = [], {}
+        # 'sources' may be missing from the response; treat that as no sources
+        for source in data.get('sources') or []:
+            src = source.get('src')
+            if not src:
+                continue
+            # The URL scheme is appended to each format_id so that sources
+            # served over different schemes do not share an identifier
+            scheme = src.split(':')[0]
+            if source.get('type') == 'application/x-mpegURL':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    src, video_id, 'mp4', m3u8_id='hls', fatal=False)
+                for f in fmts:
+                    f['format_id'] = f['format_id'].replace(' ', '').replace(')', '') + '-' + scheme
+            elif source.get('type') == 'application/dash+xml':
+                fmts, subs = self._extract_mpd_formats_and_subtitles(
+                    src, video_id, mpd_id='dash', fatal=False)
+                for f in fmts:
+                    f['format_id'] = f['format_id'] + '-' + scheme
+            else:
+                continue
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+
+        return {
+            **traverse_obj(data, {
+                'description': (('long_description', 'description'), {str_or_none}),
+                'thumbnail': (('poster', 'thumbnail'), {url_or_none}),
+                'tags': ('tags', {list}),
+                'upload_date': (('published_at', 'created_at'),
+                                {lambda x: x[:10].replace('-', '') if x else None}),
+                # The Playback API reports duration in milliseconds
+                'duration': ('duration', {lambda x: int_or_none(x, scale=1000)}),
+            }, get_all=False),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _real_extract(self, url):
+        """Extract the page's video, or a playlist when it embeds several."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Each item is (entry, player title) for a player that yielded a video
+        players, player_poster = [], None
+        for idx, (player_type, props) in enumerate(
+                re.findall(r'fe-(\w+)-player" data-props="({[^"]+?})"', webpage)):
+            # A single malformed player blob should not abort the whole page
+            video_data = self._parse_json(clean_html(props), video_id, fatal=False)
+            if not video_data:
+                continue
+            # 'Watch the trailer' is a generic label, not a usable title
+            player_title = traverse_obj(
+                video_data, ('title', {lambda x: x if x != 'Watch the trailer' else None}))
+            if idx == 0:
+                player_poster = traverse_obj(video_data, ('posterImage', {url_or_none}))
+
+            entry = None
+            if player_type == 'media':
+                if video_data.get('type') == 'brightcove' and video_data.get('brightcoveId'):
+                    entry = self._extract_brightcove(video_data['brightcoveId'], video_id)
+                elif video_data.get('type') == 'youtube' and video_data.get('youtubeId'):
+                    # A bare video ID is passed on, so name the extractor explicitly
+                    entry = self.url_result(video_data['youtubeId'], 'Youtube')
+            elif player_type == 'video' and video_data.get('url'):
+                # 'url' holds either a real URL or a numeric Brightcove ID
+                if url_or_none(video_data['url']):
+                    entry = self.url_result(video_data['url'])
+                elif int_or_none(video_data['url']):
+                    entry = self._extract_brightcove(video_data['url'], video_id)
+            # Skip players that produced nothing (e.g. a failed Brightcove lookup)
+            if entry:
+                players.append((entry, player_title))
+
+        # Malformed JSON-LD is ignored so that the OpenGraph fallback can be used
+        if json_ld := list(self._yield_json_ld(webpage, video_id, fatal=False)):
+            info = {
+                'id': video_id,
+                **traverse_obj(json_ld[-1], {
+                    'title': (('containsSeason', '@graph'), 0, (('episode', 'name'), 'name'),
+                              {lambda x: re.sub(r'\W+Bluey Official Website$', '', x).split(' | ')[-1] if x else None}),
+                    'description': (('containsSeason', '@graph'), 0,
+                                    (('episode', 'description'), 'description'), {str_or_none}),
+                    'thumbnail': ('containsSeason', 0, 'episode', 'image', {url_or_none}),
+                    'season': ('containsSeason', 0, 'name',
+                               {lambda x: x if re.match(r'Season \d+$', x) else None}),
+                    'season_number': ('containsSeason', 0, 'name',
+                                      {lambda x: int(x.replace('Season ', '')) if re.match(r'Season \d+$', x) else None}),
+                    'episode': ('containsSeason', 0, 'episode', 'episodeNumber',
+                                {lambda x: f'Episode {x}' if x else None}),
+                    'episode_number': ('containsSeason', 0, 'episode', 'episodeNumber', {int_or_none}),
+                }, get_all=False),
+            }
+            # Fall back to the player poster both when the JSON-LD image is
+            # invalid and when the image key is absent altogether
+            if not info.get('thumbnail'):
+                info['thumbnail'] = player_poster
+        else:
+            # og:title may be absent; avoid passing None to re.sub
+            title = re.sub(r'\W+Bluey Official Website$', '',
+                           self._og_search_title(webpage, default=None) or '')
+            info = {
+                'id': video_id,
+                'title': title.split(' | ')[-1] or None,
+                'description': self._og_search_description(webpage),
+                'thumbnail': player_poster or self._og_search_thumbnail(webpage),
+            }
+            if season_number := self._search_regex(r' Season (\d+)', title, 'season_number', default=None):
+                info['season'] = f'Season {season_number}'
+                info['season_number'] = int(season_number)
+            if episode_number := self._search_regex(r' Episode (\d+)', title, 'episode_number', default=None):
+                info['episode'] = f'Episode {episode_number}'
+                info['episode_number'] = int(episode_number)
+        info['uploader'] = self._html_search_meta('article:author', webpage)
+
+        if len(players) > 1:
+            entries = []
+            for idx, (entry, title) in enumerate(players, 1):
+                if not entry.get('_type'):
+                    # Directly extracted (Brightcove) entries carry no id/title
+                    # of their own, so give each one before it joins the playlist
+                    entry = merge_dicts(entry, {
+                        'id': f'{video_id}-{idx}',
+                        'title': title or info.get('title'),
+                    })
+                entries.append(entry)
+            return self.playlist_result(entries, video_id, **{
+                k: v for k, v in info.items() if v})
+        elif players:
+            entry, title = players[0]
+            if entry.get('_type'):
+                # Let the page metadata override that of the delegated extractor
+                entry['_type'] = 'url_transparent'
+            # Prefer the title of the player that actually yielded the video
+            info['title'] = title or info.get('title')
+            return merge_dicts(entry, info)
+        else:
+            return info