diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 834b1df18999..95e4881a8e02 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -10,14 +10,13 @@ ) from ..networking import Request from ..networking.exceptions import network_exceptions +from ..postprocessor import FFmpegPostProcessor from ..utils import ( ExtractorError, - clean_html, determine_ext, error_to_compat_str, float_or_none, format_field, - get_element_by_id, get_first, int_or_none, join_nonempty, @@ -32,7 +31,6 @@ url_or_none, urlencode_postdata, urljoin, - variadic, ) @@ -44,19 +42,12 @@ class FacebookIE(InfoExtractor): (?:[^#]*?\#!/)? (?: (?: - permalink\.php| - video/video\.php| - photo\.php| - video\.php| - video/embed| - story\.php| - watch(?:/live)?/? + (?:video/)?[a-z]{5,}(?:\.php|/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)?| - [^/]+/posts/| + [^/]+/(?:videos|posts)/(?:[^/]+/)?| events/(?:[^/]+/)?| groups/[^/]+/(?:permalink|posts)/| - watchparty/ + [a-z]{5,}/| )| facebook: ) @@ -89,9 +80,10 @@ class FacebookIE(InfoExtractor): 'timestamp': 1692346159, 'thumbnail': r're:^https?://.*', 'uploader_id': '100063551323670', - 'duration': 3132.184, + 'uploader_url': r're:^https?://.*', + 'duration': 3133.583, 'view_count': int, - 'concurrent_view_count': 0, + 'concurrent_view_count': int, }, }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -117,8 +109,10 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'duration': 131.03, + 'view_count': int, 'concurrent_view_count': int, }, }, { @@ -172,7 +166,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 
'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': 'ca63897a90c9452efee5f8c40d080e25', + 'md5': '1659aa21fb3dd1585874f668e81a72c8', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -184,8 +178,9 @@ class FacebookIE(InfoExtractor): 'thumbnail': r're:^https?://.*', 'view_count': int, 'uploader_id': '100059479812265', + 'uploader_url': r're:^https?://.*', 'concurrent_view_count': int, - 'duration': 44.478, + 'duration': 44.181, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -194,12 +189,13 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', + 'title': 'Довгоочікуване відео', 'description': 'Довгоочікуване відео', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'concurrent_view_count': int, 'thumbnail': r're:^https?://.*', 'view_count': int, @@ -209,32 +205,39 @@ class FacebookIE(InfoExtractor): 'skip_download': True, }, }, { - # FIXME 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 'info_dict': { - 'id': '1072691702860471', + 'id': 'giphy', 'ext': 'mp4', - 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', + 'title': 'Nada mas satisfactorio que los otros 5... 
- La Guía Del Varón', + 'description': 'Nada mas satisfactorio que los otros 5 minutos', 'timestamp': 1477305000, 'upload_date': '20161024', 'uploader': 'La Guía Del Varón', + 'uploader_id': '100050567346031', + 'uploader_url': r're:^https?://.*', 'thumbnail': r're:^https?://.*', + 'age_limit': 0, }, - 'skip': 'Requires logging in', + 'skip': 'Gif on giphy.com', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '202882990186699', 'ext': 'mp4', - 'title': 'birb (O v O") | Hello? Yes your uber ride is here', - 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...', - 'timestamp': 1486035513, + 'title': 'birb (O v O")', + 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', - 'uploader_id': '100013949973717', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', + 'duration': 23.891, + 'view_count': int, + 'concurrent_view_count': int, }, - 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media 'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/', @@ -245,10 +248,12 @@ class FacebookIE(InfoExtractor): 'description': 'Vickie Gentry shared a memory.', 'timestamp': 1511548260, 'upload_date': '20171124', - 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'uploader': 'ATTN:', + 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', 'thumbnail': r're:^https?://.*', 'duration': 148.435, + 'concurrent_view_count': int, }, }, { # 
data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -259,12 +264,14 @@ class FacebookIE(InfoExtractor): 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', 'uploader': 'ATTN:', 'upload_date': '20231207', - 'title': 'ATTN:', + 'title': 'ATTN: - Learning new problem-solving skills is hard for...', 'duration': 132.675, 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', 'view_count': int, 'thumbnail': r're:^https?://.*', 'timestamp': 1701975646, + 'concurrent_view_count': int, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -272,15 +279,17 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '270103405756416', 'ext': 'mp4', - 'title': 'Lela Evans', - 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', + 'title': 'Lela Evans - Today Makkovik\'s own Pilot Mandy Smith made...', + 'description': 'md5:cc93a91feb89923303c1f78656791e4d', 'thumbnail': r're:^https?://.*', 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'upload_date': '20231228', 'timestamp': 1703804085, 'duration': 394.347, 'view_count': int, + 'concurrent_view_count': int, }, }, { 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', @@ -288,9 +297,11 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, + 'skip': 'Video gone', }, { # data.mediaset.currMedia.edges 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', @@ 
-303,10 +314,12 @@ class FacebookIE(InfoExtractor): # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + 'skip': 'Video gone', }, { # data.video.creation_story.attachments[].media 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, + 'skip': 'Video gone', }, { # data.video 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', @@ -315,6 +328,7 @@ class FacebookIE(InfoExtractor): # no title 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, + 'skip': 'Video gone', }, { # data.video 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', @@ -327,7 +341,8 @@ class FacebookIE(InfoExtractor): 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', 'uploader_id': '100066514874195', - 'duration': 4524.212, + 'uploader_url': r're:^https?://.*', + 'duration': 4524.001, 'view_count': int, 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, @@ -344,9 +359,10 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'timestamp': 1549275572, - 'duration': 3.413, + 'duration': 3.283, 'uploader': 'Josef Novak', 'description': '', 'upload_date': '20190204', @@ -354,7 +370,21 @@ class FacebookIE(InfoExtractor): }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', - 'only_matching': True, + 'info_dict': { + 'id': '647537299265662', + 'ext': 'mp4', + 'title': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su...', + 'description': 'Padre ense\u00f1a a su hijo a c\u00f3mo 
ba\u00f1ar un reci\u00e9n nacido junto con su gato y se hace viral, mir\u00e1 el video 😍', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1605534618, + 'upload_date': '20201116', + 'uploader': 'InfoPico', + 'uploader_id': '100064391811349', + 'uploader_url': r're:^https?://.*', + 'duration': 136.179, + 'view_count': int, + 'concurrent_view_count': int, + }, }, { # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media @@ -370,11 +400,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '117576630041613', 'ext': 'mp4', - # TODO: title can be extracted from video page - 'title': 'Facebook video #117576630041613', - 'uploader_id': '189393014416438', + 'title': 'Officers Rescue Trapped Motorist from Mahoning River Crash 11-22-20', + 'thumbnail': r're:^https?://.*', + 'uploader': 'City of Alliance Police Department', + 'uploader_id': '100064413680392', + 'uploader_url': r're:^https?://.*', 'upload_date': '20201123', 'timestamp': 1606162592, + 'duration': 101.504, + 'view_count': int, + 'concurrent_view_count': int, }, 'skip': 'Requires logging in', }, { @@ -392,7 +427,24 @@ class FacebookIE(InfoExtractor): }, { # data.video.creation_story.attachments[].media 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', - 'only_matching': True, + 'info_dict': { + 'id': '1823658634322275', + 'ext': 'mp4', + 'title': 'Live Webcam from Corfu - Greece', + 'description': 'md5:84c1af6894ecffe710c79744e4873e85', + 'thumbnail': r're:^https?://.*', + 'uploader': 'SkylineWebcams', + 'uploader_id': '100064307154679', + 'uploader_url': r're:^https?://.*', + 'upload_date': '20180319', + 'timestamp': 1521449766, + 'duration': 14424.199, + 'view_count': int, + 'concurrent_view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.facebook.com/watchparty/211641140192478', 'info_dict': { @@ -411,6 +463,7 @@ class 
def _get_video_metadata(self, url):
    """Probe a media URL and return format fields describing its streams.

    Uses ``FFmpegPostProcessor`` to probe ``url`` and maps the reported
    streams onto format-dict keys.

    @param url  string  direct URL of the media file to probe
    @return     dict    any of ``width``, ``height``, ``vcodec``, ``vbr``,
                        ``fps`` (from a video stream) and ``audio_channels``,
                        ``acodec``, ``asr``, ``abr`` (from an audio stream).
                        Empty when probing is unavailable or the probe result
                        lists no video/audio stream.
    """
    ffmpeg = FFmpegPostProcessor()
    if not ffmpeg.probe_available:
        return {}

    def codec_of(stream):
        # The original wrote `'codec_tag_string' or 'codec_name'`, which Python
        # evaluates to just 'codec_tag_string'; do the intended fallback on the values.
        return str_or_none(stream.get('codec_tag_string') or stream.get('codec_name'))

    def frame_rate_of(stream):
        # The rate is expected as a "numerator/denominator" string; anything
        # else (missing, empty, no slash, zero denominator) yields None.
        numerator, _, denominator = str(stream.get('avg_frame_rate') or '').partition('/')
        numerator, denominator = int_or_none(numerator), int_or_none(denominator)
        return round(numerator / denominator, 1) if numerator and denominator else None

    def without_none(fields):
        return {key: value for key, value in fields.items() if value is not None}

    metadata = {}
    # `or []`: the probe result may lack a 'streams' list altogether
    streams = traverse_obj(ffmpeg.get_metadata_object(url), 'streams', expected_type=list) or []
    for stream in streams:
        if not isinstance(stream, dict):
            continue
        codec_type = stream.get('codec_type')
        if codec_type == 'video':
            metadata.update(without_none({
                'width': int_or_none(stream.get('width')),
                'height': int_or_none(stream.get('height')),
                'vcodec': codec_of(stream),
                'vbr': float_or_none(stream.get('bit_rate'), 1000),
            }))
            # 'fps' is always set for a video stream (possibly to None), as before,
            # so that the returned dict stays truthy whenever a video stream was found
            metadata['fps'] = frame_rate_of(stream)
        elif codec_type == 'audio':
            metadata.update(without_none({
                'audio_channels': int_or_none(stream.get('channels')),
                'acodec': codec_of(stream),
                'asr': int_or_none(stream.get('sample_rate')),
                'abr': float_or_none(stream.get('bit_rate'), 1000),
            }))
    return metadata
self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) - # some webpages contain unretrievable thumbnail urls - # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1 - # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ - if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): - thumbnail = None - info_dict = { - 'description': description, - 'uploader': uploader, - 'uploader_id': uploader_data.get('id'), - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'view_count': parse_count(self._search_regex( - (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',), - webpage, 'view count', default=None)), - 'concurrent_view_count': get_first(post, ( - ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), - } + re.sub(r'://(?:[\w-]+\.)?facebook\.com/', '://www.facebook.com/', url), video_id, tries=2) + + post_data = re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage) + + sjs_data = [self._parse_json(j, video_id, fatal=False) for j in post_data] + cookies = self._get_cookies(url) + # user passed logged-in cookies or attempted to login + login_data = cookies.get('c_user') and cookies.get('xs') + logged_in = False + if login_data: + logged_in = get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'define', + lambda _, v: 'CurrentUserInitialData' in v, ..., 'ACCOUNT_ID'), default='0') != '0' + if logged_in and (info := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data', + (('ufac_client', 'state', (('set_contact_point_state_renderer', 'title'), + ('intro_state_renderer', 'header_title'))), + ('epsilon_checkpoint', 'screen', 'title')) + ))): + if any(content in info for content in ['days left to appeal', 'suspended your account']): + raise ExtractorError('Your account is suspended', expected=True) + if 'Enter mobile number' == info: + 
raise ExtractorError('Facebook is requiring mobile number confirmation', expected=True) + if 'your account has been locked' in info: + raise ExtractorError('Your account has been locked', expected=True) + if props := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., 'rootView', + lambda _, v: v.get('title') is not None)): + if not self._cookies_passed: + self.raise_login_required() + else: + msg = join_nonempty('title', 'body', delim='. ', from_dict=props).replace('\n ', '') + raise ExtractorError(f'Facebook: {msg}', expected=True) + + def find_json_obj(json_strings, *patterns, obj_in_value=False, get_all=False): + """ + Find JSON object, in the form of a string, by regular expression + >>> obj = find_json_obj((json_a, _or_b), regex_a, (regex_b, _or_c), obj_in_value=True, get_all=True) + @param json_strings string, tuple regex patterns (match only one in tuple) + *patterns string, tuple regex patterns (match only one in tuple) + obj_in_value boolean False: find the object(s) containing the pattern(s) + True : given pattern(s) of the key(s) to find the + object(s) in the value of that key(s) + get_all boolean + @return list of tuple a list of (match, JSON object) + """ + def find_offset(string, bracket): + _BRACKET_MAP = { + '{': ('}', (1 if obj_in_value else -1)), # (opposite sign, search direction) + '}': ('{', 1), # search direction: 1 - forward, -1 - backward + } + count, b_sum, offset = 0, 0, 0 + for x in string[::_BRACKET_MAP[bracket][1]]: + count += (1 if x == bracket or x == _BRACKET_MAP[bracket][0] else 0) + b_sum += (1 if x == bracket else (-1 if x == _BRACKET_MAP[bracket][0] else 0)) + offset += 1 + if count > 0 and b_sum >= (0 if obj_in_value else 1): + break + return offset * _BRACKET_MAP[bracket][1] + for json_str in (json_strings if isinstance(json_strings, tuple) else [json_strings]): # return 1st match + if isinstance(json_str, str): + for pattern_grp in (patterns if isinstance(patterns, tuple) else [patterns]): # 
match all + for pattern in (pattern_grp if isinstance(pattern_grp, tuple) else [pattern_grp]): # return 1st match + found = False + if isinstance(pattern, str): + for m in re.finditer(pattern, json_str): # depend on get_all + i = m.end(m.lastindex or 0) if obj_in_value else m.start(m.lastindex or 0) + opening = (i + find_offset( + json_str[(i * obj_in_value):(i * (not obj_in_value) - obj_in_value)], '{' + ) - obj_in_value) + closing = i + find_offset(json_str[i:], '}') + if isinstance(opening, int) and isinstance(closing, int): + found = True + yield (m.group(0), json_str[opening:closing:]) + if not get_all: + break + else: + if found: + break + continue + break + else: + continue + else: + if found: + break + continue + break + + page_description = self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default=None) + page_title = self._html_search_regex(( + r'\s(?P<content>[\s\S]+?)\s', + self._meta_regex('og:title'), self._meta_regex('twitter:title'), + r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', + r'(?s)(?P.*?)' + ), re.sub(r'(Facebook(\sLive)?)|(Video)', '', + webpage), 'title', default='', group='content').split(' | ')[0] + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) + if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): + thumbnail = None + + data, uploader_info = [], {} + p_id = s_id = linked_url = description = title = timestamp = None + for p_data in post_data[:]: + if '"feed_unit":' not in p_data: + if (not s_id or not p_id) and ('"story":' in p_data or '"creation_story":' in p_data): + p_id = p_id if p_id else (lambda x: x.group(1) if x else + (video_id if video_id.isnumeric() else None) + )(re.search(r'"(?:post_id|videoId|video_id)":\s?"(\d+)"', p_data)) + s_id = s_id if s_id else (lambda x: x.group(1) if x else None + )(re.search(r'"id":\s?"(Uzpf[^"]+)"', p_data)) + if not data: + if '"dash_manifest_url":' in p_data: + for x in 
find_json_obj(p_data, r'"data":\s?{', r'"data":', obj_in_value=True): + if '"dash_manifest_url":' in x[1]: + data = x[1] + break + elif '"attachment":{"source":{' in p_data or '"attachment":{"web_link":{' in p_data: + # linked media + for x in find_json_obj(p_data, r'"data":\s?{', r'"data":', obj_in_value=True): + if '"attachment":{"source":{' in x[1] or '"attachment":{"web_link":{' in x[1]: + data = x[1] + break + for x in find_json_obj( + data, r'("attachment"):\s*{"source":', r'("attachment"):\s*{"web_link":', + obj_in_value=True + ): + if linked_url := traverse_obj( + json.loads(x[1]), (('web_link', None), 'url', {url_or_none}), get_all=False): + break + if not re.search(r'"(?:dash_manifest_url|title|message)":', p_data): + post_data.remove(p_data) + else: + post_data.remove(p_data) + if data and s_id and p_id: + break + + post_data = ','.join(post_data) + # uploader + for x in find_json_obj(post_data, ( + r'"actors":[^}]*"__isActor":', r'"owner":[^}]*"name":\s?"[^"]'), get_all=True): + if f'"id":"{s_id}"' in x[1] and '"name":"' in x[1]: + uploader_info = traverse_obj(json.loads(x[1]), { + 'uploader': (('owner', ('actors', ...)), 'name', {str}), + 'uploader_id': (('owner', ('actors', ...)), 'id', {str}), + 'uploader_url': (('owner', ('actors', ...)), 'url', {str}), + }, get_all=False) + uploader_info['uploader_url'] = ( + uploader_info.get('uploader_url') + or f"https://www.facebook.com/profile.php?id={uploader_info['uploader_id']}") + break + # description + for x in find_json_obj(post_data, ( + r',"message":(?:(?!"message":)[^}])*"text":\s?"[^"](?:(?!"id":).)*"id":', + r'"message":(?:(?!"message":)[^}])*"text":\s?"[^"](?:(?!"id":).)*"id":' + ), get_all=True): + x_dict = json.loads(x[1]) + for i in [i for i in [s_id, p_id] if i is not None]: + if x_dict.get('id') == i: + if (description := x_dict['message'] if isinstance(x_dict['message'], str) + else (x_dict['message'] or {}).get('text')): + if (track_title := (lambda x: x.group(0) if x else None + 
)(re.search(r'"track_title":\s?"[^"]+"', x[1]))): + description += '. ' + json.loads('{' + track_title + '}')['track_title'] + break + if description: + break + # title / description + for x in find_json_obj(post_data, r'"title":(?:(?!"title":).)*"text":\s?"[^"]', get_all=True): + x_dict = json.loads(x[1]) + if p_id: + if (text := x_dict['title'] if isinstance(x_dict['title'], str) + else (x_dict['title'] or {}).get('text')): + title = title if title else (text if x_dict.get('id') == p_id else title) + page_description = (page_description if page_description + else (text if x_dict.get('id') == s_id else page_description)) + if title and page_description: + break + title = (lambda x: x if x != uploader_info.get('uploader') else None + )(title or page_title or re.sub(r'(\s*\n\s*)', ' ', (description or ''))) + title = title if len(title or '') <= 100 else title[:(47 + title[47:67].rfind(' '))] + '...' + # timestamp + for x in find_json_obj( + post_data, r'"publish_time":\d+,', r'"creation_time":\d+,', get_all=True + ): + if f'"id":"{p_id}"' in x[1] or f'"id":"{s_id}"' in x[1]: + if timestamp := json.loads(x[1]).get(x[0].split('"')[1]): + break + + webpage_info = { + 'title': title, + 'description': description or page_description, + 'thumbnails': [{k: v for k, v in { + 'url': thumbnail, + 'height': (lambda x: int_or_none(x.group(1)) if x else None + )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', thumbnail)) + }.items() if v is not None}] if url_or_none(thumbnail) else [], + 'timestamp': timestamp, + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',), + webpage, 'view count', default=None)), + } + + if linked_url: + return self.url_result(linked_url, video_id=video_id, url_transparent=True, + **{k: v for k, v in merge_dicts(webpage_info, uploader_info).items() if v}) + + def extract_dash_manifest(video, formats): + dash_manifest = video.get('dash_manifest') + if dash_manifest: + 
formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), + mpd_url=video.get('dash_manifest_url'))) - info_json_ld = self._search_json_ld(webpage, video_id, default={}) - info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '') - or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}') - return merge_dicts(info_json_ld, info_dict) + def process_formats(info): + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + for f in info['formats']: + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + # Formats larger than ~500MB will return error 403 unless chunk size is regulated + f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 + + if data: + entries = [] + + def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + formats = [] + for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): + playable_url = video.get(key) + if not playable_url: + continue + if determine_ext(playable_url) == 'mpd': + formats.extend(self._extract_mpd_formats(playable_url, video_id)) + else: + metadata = self._get_video_metadata(playable_url) + if metadata: + formats.append({**{ + 'format_id': format_id, + 'url': playable_url, + }, **metadata}) + else: + q = qualities(['sd', 'hd']) + formats.append({ + 'format_id': format_id, + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + + # captions/subtitles + automatic_captions, subtitles = {}, {} + is_broadcast = traverse_obj(video, 
('is_video_broadcast', {bool})) + for caption in traverse_obj(video, ( + 'video_available_captions_locales', + {lambda x: sorted(x, key=lambda c: c['locale'])}, + lambda _, v: url_or_none(v['captions_url']) + )): + lang = caption.get('localized_language') or 'und' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), + } + if caption.get('localized_creation_method') or is_broadcast: + automatic_captions.setdefault(caption['locale'], []).append(subs) + else: + subtitles.setdefault(caption['locale'], []).append(subs) + captions_url = traverse_obj(video, ('captions_url', {url_or_none})) + if captions_url and not automatic_captions and not subtitles: + locale = self._html_search_meta( + ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] + # thumbnails + thumbnails = [] + for url in [uri for uri in [traverse_obj(video, path) for path in [ + ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri'), + ('image', 'uri'), ('previewImage', 'uri') + ]] if url_or_none(uri) is not None]: + if (re.search(r'\.(?:jpg|png)', url) + and not any(url.split('_cat=')[0] in t['url'] for t in thumbnails)): + thumbnails.append({k: v for k, v in { + 'url': url, + 'height': (lambda x: int_or_none(x.group(1)) if x else None + )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', url)) + }.items() if v is not None}) + # uploader + if uploader_id := traverse_obj(video, ('owner', 'id', {str_or_none})): + if x := list(find_json_obj(post_data, ( + r'"id":\s?"%s"[^}]*"name":\s?"[^"]' % uploader_id, + r'"name":\s?"[^"][^}]*"id":\s?"%s"' % uploader_id))): + if x[0][1]: + video['owner'] = merge_dicts(video['owner'], json.loads(x[0][1])) + elif x := list(find_json_obj(data, ( + r'("video_owner":)[^}]*"name":\s?"[^"]', r'([_"]owner":)[^}]*"name":\s?"[^"]', + r'(_creator":)[^}]*"name":\s?"[^"]', r'("actor":)[^}]*"name":\s?"[^"]' + 
), obj_in_value=True)): + if x[0][1]: + video['owner'] = json.loads(x[0][1]) + uploader = try_get(video, lambda x: x['owner']['name']) + # title + if v_name := video.get('name'): + v_title = v_name if len(v_name) <= 100 else v_name[:(47 + v_name[47:67].rfind(' '))] + '...' + + info = { + 'id': v_id, + 'title': ((v_title if video.get('name') else None) + or (f"{webpage_info['title']} #{v_id}" if webpage_info['title'] + else (f"{uploader}'s Video #{v_id}" if uploader else f'Facebook Video #{v_id}'))), + 'description': (try_get(video, lambda x: x['savable_description']['text']) + or video.get('name') or webpage_info['description']), + 'thumbnails': thumbnails, + 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), + 'uploader': uploader, + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'uploader_url': (try_get(video, lambda x: x['owner']['url']) + or (lambda x: f'https://www.facebook.com/profile.php?id={x}' if x else None + )(try_get(video, lambda x: x['owner']['id']))), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), + 'formats': formats, + 'automatic_captions': automatic_captions, + 'subtitles': subtitles, + 'concurrent_view_count': video.get('liveViewerCount'), + } + process_formats(info) + entries.append(info) + + video_ids = [] + for idx, x in enumerate(find_json_obj( + data, (r'"dash_manifest_url":\s?"', r'_hd_url":\s?"', r'_sd_url":\s?"'), get_all=True + )): + media = json.loads(x[1]) + if (media.get('__typename', 'Video') == 'Video' + and not media.get('sticker_image') + and not media.get('id', f'{video_id}_{idx}') in video_ids): + video_ids.append(media.get('id', f'{video_id}_{idx}')) + parse_graphql_video(media) + if media.get('id') == video_id: + break + + if len(entries) > 1: + return self.playlist_result(entries, video_id, **{ + k: v for k, v in merge_dicts(webpage_info, uploader_info).items() if v}) + + video_info = 
entries[0] if entries else {'id': video_id} + if "'s Video #" in video_info.get('title', ''): + video_info['title'] = f"{video_info['uploader']}'s Video" + if video_info['timestamp']: + webpage_info['timestamp'] = None + if webpage_info['thumbnails']: + if not (any(webpage_info['thumbnails'][0]['url'].split('_cat=')[0] in thumbnail['url'] + for thumbnail in video_info['thumbnails'])): + video_info['thumbnails'].extend(webpage_info['thumbnails']) + webpage_info['thumbnails'] = video_info['thumbnails'] + return merge_dicts(webpage_info, video_info) video_data = None @@ -559,34 +946,6 @@ def extract_from_jsmods_instances(js_data): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) - def extract_dash_manifest(video, formats): - dash_manifest = video.get('dash_manifest') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), - mpd_url=video.get('dash_manifest_url'))) - - def process_formats(info): - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. - for f in info['formats']: - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. 
- f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - # Formats larger than ~500MB will return error 403 unless chunk size is regulated - f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 - - def extract_relay_data(_filter): - return self._parse_json(self._search_regex( - r'data-sjs>({.*?%s.*?})' % _filter, - webpage, 'replay data', default='{}'), video_id, fatal=False) or {} - - def extract_relay_prefetched_data(_filter): - return traverse_obj(extract_relay_data(_filter), ( - 'require', (None, (..., ..., ..., '__bbox', 'require')), - lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), - ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {} - if not video_data: server_js_data = self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, @@ -594,131 +953,6 @@ def extract_relay_prefetched_data(_filter): ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) - if not video_data: - data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') - if data: - entries = [] - - def parse_graphql_video(video): - v_id = video.get('videoId') or video.get('id') or video_id - reel_info = traverse_obj( - video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) - if reel_info: - video = video['creation_story'] - video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) - video.update(reel_info) - formats = [] - q = qualities(['sd', 'hd']) - for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), - ('browser_native_sd_url', 'sd')): - playable_url = video.get(key) - if not playable_url: - continue - if determine_ext(playable_url) == 'mpd': - 
formats.extend(self._extract_mpd_formats(playable_url, video_id)) - else: - formats.append({ - 'format_id': format_id, - # sd, hd formats w/o resolution info should be deprioritized below DASH - 'quality': q(format_id) - 3, - 'url': playable_url, - }) - extract_dash_manifest(video, formats) - - automatic_captions, subtitles = {}, {} - is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) - for caption in traverse_obj(video, ( - 'video_available_captions_locales', - {lambda x: sorted(x, key=lambda c: c['locale'])}, - lambda _, v: url_or_none(v['captions_url']) - )): - lang = caption.get('localized_language') or 'und' - subs = { - 'url': caption['captions_url'], - 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), - } - if caption.get('localized_creation_method') or is_broadcast: - automatic_captions.setdefault(caption['locale'], []).append(subs) - else: - subtitles.setdefault(caption['locale'], []).append(subs) - captions_url = traverse_obj(video, ('captions_url', {url_or_none})) - if captions_url and not automatic_captions and not subtitles: - locale = self._html_search_meta( - ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') - (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] - - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': traverse_obj( - video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), - 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), - 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) - or float_or_none(video.get('length_in_second'))), - 'automatic_captions': automatic_captions, - 'subtitles': subtitles, - } - process_formats(info) - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 
'description': description, - }) - else: - info['title'] = description or 'Facebook video #%s' % v_id - entries.append(info) - - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) - - nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) - attachments = traverse_obj(nodes, ( - ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', - ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), - 'attachment', {dict})) - for attachment in attachments: - ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), - ('target', 'attachments', ..., 'styles', 'attachment', {dict})) - for n in ns: - parse_attachment(n) - parse_attachment(attachment) - - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') - - video = traverse_obj(data, ( - 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'] - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - if len(entries) > 1: - return self.playlist_result(entries, video_id) - - video_info = entries[0] if entries else {'id': video_id} - webpage_info = extract_metadata(webpage) - # honor precise duration in video info - if video_info.get('duration'): - webpage_info['duration'] = video_info['duration'] - # preserve preferred_thumbnail in video info - if video_info.get('thumbnail'): - webpage_info['thumbnail'] = video_info['thumbnail'] - return merge_dicts(webpage_info, video_info) - if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) if m_msg is not None: @@ -731,6 +965,17 @@ def parse_attachment(attachment, key='media'): 'id="loginbutton"')): self.raise_login_required() + def extract_relay_data(_filter): + return self._parse_json(self._search_regex( + r'data-sjs>({.*?%s.*?})' % _filter, + webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + + def extract_relay_prefetched_data(_filter): + return traverse_obj(extract_relay_data(_filter), ( + 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), + ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {} + if not video_data and '/watchparty/' in url: post_data = { 'doc_id': 3731964053542869, @@ -772,6 +1017,8 @@ def parse_attachment(attachment, key='media'): # tahoe player specific URL tahoe_data = self._download_webpage( self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + errnote='Unable to download alterative webpage', + fatal=False, data=urlencode_postdata({ '__a': 1, '__pc': self._search_regex( @@ -787,15 +1034,20 @@ def parse_attachment(attachment, key='media'): headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) - tahoe_js_data = self._parse_json( - self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, - 'tahoe js data', default='{}'), - video_id, fatal=False) - video_data = extract_from_jsmods_instances(tahoe_js_data) + if tahoe_data: + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data: - raise ExtractorError('Cannot parse data') + if not login_data: + self.raise_login_required('No video formats found') + if not logged_in: + self.raise_login_required('Failed to login with provided data') + self.raise_no_formats('No video formats found!') if len(video_data) > 1: entries = [] @@ -820,17 +1072,24 @@ def 
parse_attachment(attachment, key='media'): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: - # sd, hd formats w/o resolution info should be deprioritized below DASH - # TODO: investigate if progressive or src formats still exist - preference = -10 if format_id == 'progressive' else -3 - if quality == 'hd': - preference += 1 - formats.append({ - 'format_id': '%s_%s_%s' % (format_id, quality, src_type), - 'url': src, - 'quality': preference, - 'height': 720 if quality == 'hd' else None - }) + metadata = self._get_video_metadata(src) + if metadata: + formats.append({**{ + 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'url': src, + }, **metadata}) + else: + # sd, hd formats w/o resolution info should be deprioritized below DASH + # TODO: investigate if progressive or src formats still exist + preference = -10 if format_id == 'progressive' else -3 + if quality == 'hd': + preference += 1 + formats.append({ + 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'url': src, + 'quality': preference, + 'height': 720 if quality == 'hd' else None + }) extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') if subtitles_src: @@ -842,7 +1101,7 @@ def parse_attachment(attachment, key='media'): 'subtitles': subtitles, } process_formats(info_dict) - info_dict.update(extract_metadata(webpage)) + info_dict.update(webpage_info) return info_dict @@ -858,15 +1117,21 @@ class FacebookPluginsVideoIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', - 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'md5': 'ce2c32ab234398f4645389326133bce8', 'info_dict': { 'id': '10154383743583686', 'ext': 'mp4', - # TODO: Fix title, uploader 'title': 'What to do during the haze?', + 'description': 'md5:81839c0979803a014b20798df255ed0b', + 'thumbnail': r're:^https?://.*', 'uploader': 
'Gov.sg', + 'uploader_id': '100064718678925', + 'uploader_url': r're:^https?://.*', 'upload_date': '20160826', 'timestamp': 1472184808, + 'duration': 65.087, + 'view_count': int, + 'concurrent_view_count': int, }, 'add_ie': [FacebookIE.ie_key()], }, { @@ -897,7 +1162,7 @@ class FacebookRedirectURLIE(InfoExtractor): 'playable_in_embed': True, 'categories': ['Music'], 'channel': 'Boiler Room', - 'uploader_id': 'brtvofficial', + 'uploader_id': '@boilerroom', 'uploader': 'Boiler Room', 'tags': 'count:11', 'duration': 3332, @@ -905,11 +1170,15 @@ class FacebookRedirectURLIE(InfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', 'availability': 'public', - 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'uploader_url': r're:^https?://.*', 'upload_date': '20150917', 'age_limit': 0, 'view_count': int, 'like_count': int, + 'heatmap': 'count:100', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, @@ -932,10 +1201,11 @@ class FacebookReelIE(InfoExtractor): 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'title': 'md5:32aab9976c6b8a145fc0d799631e2b74', + 'description': 'md5:3ea795c5ebb7ed28e3e78bb7b1191753', + 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', + 'uploader_url': r're:^https?://.*', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', @@ -949,7 +1219,7 @@ def _real_extract(self, url): f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) -class FacebookAdsIE(InfoExtractor): +class FacebookAdsIE(FacebookIE): _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P\d+)' IE_NAME = 'facebook:ads' 
@@ -1000,13 +1270,13 @@ def _extract_formats(self, video_dict): for format_key, format_url in traverse_obj(video_dict, ( {dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1]) )): - formats.append({ + formats.append({**{ 'format_id': self._FORMATS_MAP[format_key][0], 'format_note': self._FORMATS_MAP[format_key][1], 'url': format_url, 'ext': 'mp4', 'quality': qualities(tuple(self._FORMATS_MAP))(format_key), - }) + }, **self._get_video_metadata(format_url)}) return formats def _real_extract(self, url):