Merge branch 'yt-dlp:master' into master

saintliao · Dec 13, 2023 · de1d3d2 · de1d3d2
2 parents 63d3795 + 6b5d93b
commit de1d3d2
Show file tree

Hide file tree

Showing 20 changed files with 338 additions and 89 deletions.
diff --git a/README.md b/README.md
@@ -1333,6 +1333,7 @@ The available fields are:
  - `was_live` (boolean): Whether this video was originally a live stream
  - `playable_in_embed` (string): Whether this video is allowed to play in embedded players on other sites
  - `availability` (string): Whether the video is "private", "premium_only", "subscriber_only", "needs_auth", "unlisted" or "public"
+ - `media_type` (string): The type of media as classified by the site, e.g. "episode", "clip", "trailer"
  - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL
  - `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL
  - `extractor` (string): Name of the extractor

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -2317,23 +2317,6 @@ def test_traverse_obj(self):
         self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [],
                          msg='branching should result in list if `traverse_string`')
 
-        # Test is_user_input behavior
-        _IS_USER_INPUT_DATA = {'range8': list(range(8))}
-        self.assertEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3'),
-                                      is_user_input=True), 3,
-                         msg='allow for string indexing if `is_user_input`')
-        self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3:'),
-                                           is_user_input=True), tuple(range(8))[3:],
-                              msg='allow for string slice if `is_user_input`')
-        self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':4:2'),
-                                           is_user_input=True), tuple(range(8))[:4:2],
-                              msg='allow step in string slice if `is_user_input`')
-        self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':'),
-                                           is_user_input=True), range(8),
-                              msg='`:` should be treated as `...` if `is_user_input`')
-        with self.assertRaises(TypeError, msg='too many params should result in error'):
-            traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), is_user_input=True)
-
         # Test re.Match as input obj
         mobj = re.fullmatch(r'0(12)(?P<group>3)(4)?', '0123')
         self.assertEqual(traverse_obj(mobj, ...), [x for x in mobj.groups() if x is not None],

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
@@ -1201,6 +1201,15 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
                 (?:\|(?P<default>.*?))?
             )$''')
 
+        def _from_user_input(field):  # map one user-typed path segment to a traversal key
+            if field == ':':  # a bare colon stands for Ellipsis (`...`)
+                return ...
+            elif ':' in field:  # 'a:b[:c]' becomes a slice; each part goes through int_or_none
+                return slice(*map(int_or_none, field.split(':')))  # more than 3 parts makes slice() raise TypeError
+            elif int_or_none(field) is not None:  # text that int_or_none accepts is returned as an int
+                return int(field)
+            return field  # anything else is passed through unchanged
+
         def _traverse_infodict(fields):
             fields = [f for x in re.split(r'\.({.+?})\.?', fields)
                       for f in ([x] if x.startswith('{') else x.split('.'))]
@@ -1210,11 +1219,12 @@ def _traverse_infodict(fields):
 
             for i, f in enumerate(fields):
                 if not f.startswith('{'):
+                    fields[i] = _from_user_input(f)
                     continue
                 assert f.endswith('}'), f'No closing brace for {f} in {fields}'
-                fields[i] = {k: k.split('.') for k in f[1:-1].split(',')}
+                fields[i] = {k: list(map(_from_user_input, k.split('.'))) for k in f[1:-1].split(',')}
 
-            return traverse_obj(info_dict, fields, is_user_input=True, traverse_string=True)
+            return traverse_obj(info_dict, fields, traverse_string=True)
 
         def get_value(mdict):
             # Object traversal

diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py
@@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
         'info_dict': {
             'id': '22253814',
             'ext': 'mp4',
-            'title': 'Winter is Coming',
-            'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
+            'title': 'Winter Is Coming',
+            'description': 'md5:a40e370925074260b1c8a633c632c63a',
             'timestamp': 1338306241,
             'upload_date': '20120529',
             'uploader': 'AENE-NEW',
+            'duration': 2592.0,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:5',
+            'tags': 'count:14',
+            'categories': ['Mountain Men'],
+            'episode_number': 1,
+            'episode': 'Episode 1',
+            'season': 'Season 1',
+            'season_number': 1,
+            'series': 'Mountain Men',
         },
         'params': {
             # m3u8 download
@@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
             'timestamp': 1452634428,
             'upload_date': '20160112',
             'uploader': 'AENE-NEW',
+            'duration': 1277.695,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'tags': 'count:23',
+            'episode': 'Episode 1',
+            'episode_number': 1,
+            'season': 'Season 9',
+            'season_number': 9,
+            'series': 'Duck Dynasty',
         },
         'params': {
             # m3u8 download

diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py
@@ -7,8 +7,10 @@
     ExtractorError,
     OnDemandPagedList,
     clean_html,
+    extract_attributes,
     get_element_by_class,
     get_element_by_id,
+    get_element_html_by_class,
     get_elements_html_by_class,
     int_or_none,
     orderedSet,
@@ -17,6 +19,7 @@
     traverse_obj,
     unified_strdate,
     urlencode_postdata,
+    urljoin,
 )
 
 
@@ -34,6 +37,25 @@ class BitChuteIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'BitChute',
             'upload_date': '20170103',
+            'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
+            'channel': 'BitChute',
+            'channel_url': 'https://www.bitchute.com/channel/bitchute/'
+        },
+    }, {
+        # test case: video with different channel and uploader
+        'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/',
+        'md5': 'f10e6a8e787766235946d0868703f1d0',
+        'info_dict': {
+            'id': 'Yti_j9A-UZ4',
+            'ext': 'mp4',
+            'title': 'Israel at War | Full Measure',
+            'description': 'md5:38cf7bc6f42da1a877835539111c69ef',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'sharylattkisson',
+            'upload_date': '20231106',
+            'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/',
+            'channel': 'Full Measure with Sharyl Attkisson',
+            'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/'
         },
     }, {
         # video not downloadable in browser, but we can recover it
@@ -48,6 +70,9 @@ class BitChuteIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'BitChute',
             'upload_date': '20181113',
+            'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
+            'channel': 'BitChute',
+            'channel_url': 'https://www.bitchute.com/channel/bitchute/'
         },
         'params': {'check_formats': None},
     }, {
@@ -99,6 +124,11 @@ def _raise_if_restricted(self, webpage):
             reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title
             self.raise_geo_restricted(reason)
 
+    @staticmethod
+    def _make_url(html):  # build an absolute bitchute.com URL from the href of the 'spa'-class element in `html`
+        path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href')  # '' replaces a falsy lookup result
+        return urljoin('https://www.bitchute.com', path)  # NOTE(review): path may be None if no href was found; confirm urljoin handles that
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(
@@ -121,12 +151,19 @@ def _real_extract(self, url):
                 'Video is unavailable. Please make sure this video is playable in the browser '
                 'before reporting this issue.', expected=True, video_id=video_id)
 
+        details = get_element_by_class('details', webpage) or ''
+        uploader_html = get_element_html_by_class('creator', details) or ''
+        channel_html = get_element_html_by_class('name', details) or ''
+
         return {
             'id': video_id,
             'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
             'description': self._og_search_description(webpage, default=None),
             'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader': clean_html(get_element_by_class('owner', webpage)),
+            'uploader': clean_html(uploader_html),
+            'uploader_url': self._make_url(uploader_html),
+            'channel': clean_html(channel_html),
+            'channel_url': self._make_url(channel_html),
             'upload_date': unified_strdate(self._search_regex(
                 r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
             'formats': formats,
@@ -154,6 +191,9 @@ class BitChuteChannelIE(InfoExtractor):
                     'thumbnail': r're:^https?://.*\.jpg$',
                     'uploader': 'BitChute',
                     'upload_date': '20170103',
+                    'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
+                    'channel': 'BitChute',
+                    'channel_url': 'https://www.bitchute.com/channel/bitchute/',
                     'duration': 16,
                     'view_count': int,
                 },
@@ -169,7 +209,7 @@ class BitChuteChannelIE(InfoExtractor):
         'info_dict': {
             'id': 'wV9Imujxasw9',
             'title': 'Bruce MacDonald and "The Light of Darkness"',
-            'description': 'md5:04913227d2714af1d36d804aa2ab6b1e',
+            'description': 'md5:747724ef404eebdfc04277714f81863e',
         }
     }]
 

diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
@@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
             'chapters': [],
             'duration': 494.811,
+            'categories': ['AudioMobile/All in a Weekend Montreal'],
+            'tags': 'count:8',
+            'location': 'Quebec',
+            'series': 'All in a Weekend Montreal',
+            'season': 'Season 2015',
+            'season_number': 2015,
+            'media_type': 'Excerpt',
         },
     }, {
         'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
             'chapters': [],
             'duration': 186.867,
+            'series': 'CBC News: Windsor at 6:00',
+            'categories': ['News/Canada/Windsor'],
+            'location': 'Windsor',
+            'tags': ['cancer'],
+            'creator': 'Allison Johnson',
+            'media_type': 'Excerpt',
         },
     }, {
         # Has subtitles
         # These broadcasts expire after ~1 month, can find new test URL here:
         # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
-        'url': 'http://www.cbc.ca/player/play/2249992771553',
-        'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+        'url': 'http://www.cbc.ca/player/play/2284799043667',
+        'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
         'info_dict': {
-            'id': '2249992771553',
+            'id': '2284799043667',
             'ext': 'mp4',
-            'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
-            'description': 'md5:adba28011a56cfa47a080ff198dad27a',
-            'timestamp': 1690596000,
-            'duration': 2716.333,
+            'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
+            'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
+            'timestamp': 1700272800,
+            'duration': 2718.833,
             'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
             'uploader': 'CBCC-NEW',
             'chapters': 'count:5',
-            'upload_date': '20230729',
+            'upload_date': '20231118',
+            'categories': 'count:4',
+            'series': 'The National - Full Show',
+            'tags': 'count:1',
+            'creator': 'News',
+            'location': 'Canada',
+            'media_type': 'Full Program',
         },
     }]
 

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
@@ -382,6 +382,7 @@ class InfoExtractor:
                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
                     to set it
+    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
     _old_archive_ids: A list of old archive ids needed for backward compatibility
     _format_sort_fields: A list of fields to use for sorting formats
     __post_extractor: A function to be called just before the metadata is

diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py
@@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
             'timestamp': 1444107300,
             'age_limit': 14,
             'uploader': 'CWTV',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'episode': 'Episode 20',
+            'season': 'Season 11',
         },
         'params': {
             # m3u8 download