diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 834b1df18999..95e4881a8e02 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -10,14 +10,13 @@
)
from ..networking import Request
from ..networking.exceptions import network_exceptions
+from ..postprocessor import FFmpegPostProcessor
from ..utils import (
ExtractorError,
- clean_html,
determine_ext,
error_to_compat_str,
float_or_none,
format_field,
- get_element_by_id,
get_first,
int_or_none,
join_nonempty,
@@ -32,7 +31,6 @@
url_or_none,
urlencode_postdata,
urljoin,
- variadic,
)
@@ -44,19 +42,12 @@ class FacebookIE(InfoExtractor):
(?:[^#]*?\#!/)?
(?:
(?:
- permalink\.php|
- video/video\.php|
- photo\.php|
- video\.php|
- video/embed|
- story\.php|
- watch(?:/live)?/?
+ (?:video/)?[a-z]{5,}(?:\.php|/live)?/?
)\?(?:.*?)(?:v|video_id|story_fbid)=|
- [^/]+/videos/(?:[^/]+/)?|
- [^/]+/posts/|
+ [^/]+/(?:videos|posts)/(?:[^/]+/)?|
events/(?:[^/]+/)?|
groups/[^/]+/(?:permalink|posts)/|
- watchparty/
+ [a-z]{5,}/|
)|
facebook:
)
@@ -89,9 +80,10 @@ class FacebookIE(InfoExtractor):
'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670',
- 'duration': 3132.184,
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 3133.583,
'view_count': int,
- 'concurrent_view_count': 0,
+ 'concurrent_view_count': int,
},
}, {
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
@@ -117,8 +109,10 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506',
'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
- 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'duration': 131.03,
+ 'view_count': int,
'concurrent_view_count': int,
},
}, {
@@ -172,7 +166,7 @@ class FacebookIE(InfoExtractor):
# have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
- 'md5': 'ca63897a90c9452efee5f8c40d080e25',
+ 'md5': '1659aa21fb3dd1585874f668e81a72c8',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
@@ -184,8 +178,9 @@ class FacebookIE(InfoExtractor):
'thumbnail': r're:^https?://.*',
'view_count': int,
'uploader_id': '100059479812265',
+ 'uploader_url': r're:^https?://.*',
'concurrent_view_count': int,
- 'duration': 44.478,
+ 'duration': 44.181,
},
}, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
@@ -194,12 +189,13 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'Довгоочікуване відео | By Yaroslav - Facebook',
+ 'title': 'Довгоочікуване відео',
'description': 'Довгоочікуване відео',
'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
- 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'concurrent_view_count': int,
'thumbnail': r're:^https?://.*',
'view_count': int,
@@ -209,32 +205,39 @@ class FacebookIE(InfoExtractor):
'skip_download': True,
},
}, {
- # FIXME
'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
'info_dict': {
- 'id': '1072691702860471',
+ 'id': 'giphy',
'ext': 'mp4',
- 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
+ 'title': 'Nada mas satisfactorio que los otros 5... - La Guía Del Varón',
+ 'description': 'Nada mas satisfactorio que los otros 5 minutos',
'timestamp': 1477305000,
'upload_date': '20161024',
'uploader': 'La Guía Del Varón',
+ 'uploader_id': '100050567346031',
+ 'uploader_url': r're:^https?://.*',
'thumbnail': r're:^https?://.*',
+ 'age_limit': 0,
},
- 'skip': 'Requires logging in',
+ 'skip': 'Gif on giphy.com',
}, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': {
'id': '202882990186699',
'ext': 'mp4',
- 'title': 'birb (O v O") | Hello? Yes your uber ride is here',
- 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...',
- 'timestamp': 1486035513,
+ 'title': 'birb (O v O")',
+ 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1486035494,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
- 'uploader_id': '100013949973717',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 23.891,
+ 'view_count': int,
+ 'concurrent_view_count': int,
},
- 'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
@@ -245,10 +248,12 @@ class FacebookIE(InfoExtractor):
'description': 'Vickie Gentry shared a memory.',
'timestamp': 1511548260,
'upload_date': '20171124',
- 'uploader': 'Vickie Gentry',
- 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
+ 'uploader': 'ATTN:',
+ 'uploader_id': '100064451419378',
+ 'uploader_url': r're:^https?://.*',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
+ 'concurrent_view_count': int,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
@@ -259,12 +264,14 @@ class FacebookIE(InfoExtractor):
'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb',
'uploader': 'ATTN:',
'upload_date': '20231207',
- 'title': 'ATTN:',
+ 'title': 'ATTN: - Learning new problem-solving skills is hard for...',
'duration': 132.675,
'uploader_id': '100064451419378',
+ 'uploader_url': r're:^https?://.*',
'view_count': int,
'thumbnail': r're:^https?://.*',
'timestamp': 1701975646,
+ 'concurrent_view_count': int,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
@@ -272,15 +279,17 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '270103405756416',
'ext': 'mp4',
- 'title': 'Lela Evans',
- 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
+ 'title': 'Lela Evans - Today Makkovik\'s own Pilot Mandy Smith made...',
+ 'description': 'md5:cc93a91feb89923303c1f78656791e4d',
'thumbnail': r're:^https?://.*',
'uploader': 'Lela Evans',
- 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20231228',
'timestamp': 1703804085,
'duration': 394.347,
'view_count': int,
+ 'concurrent_view_count': int,
},
}, {
'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552',
@@ -288,9 +297,11 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
+ 'skip': 'Video gone',
}, {
'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
'only_matching': True,
+ 'skip': 'Video gone',
}, {
# data.mediaset.currMedia.edges
'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
@@ -303,10 +314,12 @@ class FacebookIE(InfoExtractor):
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
'only_matching': True,
+ 'skip': 'Video gone',
}, {
# data.video.creation_story.attachments[].media
'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
'only_matching': True,
+ 'skip': 'Video gone',
}, {
# data.video
'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670',
@@ -315,6 +328,7 @@ class FacebookIE(InfoExtractor):
# no title
'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
'only_matching': True,
+ 'skip': 'Video gone',
}, {
# data.video
'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
@@ -327,7 +341,8 @@ class FacebookIE(InfoExtractor):
'upload_date': '20180523',
'uploader': 'ESL One Dota 2',
'uploader_id': '100066514874195',
- 'duration': 4524.212,
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 4524.001,
'view_count': int,
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
@@ -344,9 +359,10 @@ class FacebookIE(InfoExtractor):
'title': 'Josef',
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
- 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'timestamp': 1549275572,
- 'duration': 3.413,
+ 'duration': 3.283,
'uploader': 'Josef Novak',
'description': '',
'upload_date': '20190204',
@@ -354,7 +370,21 @@ class FacebookIE(InfoExtractor):
}, {
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/watch/?v=647537299265662',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '647537299265662',
+ 'ext': 'mp4',
+ 'title': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su...',
+ 'description': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su gato y se hace viral, mirá el video 😍',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1605534618,
+ 'upload_date': '20201116',
+ 'uploader': 'InfoPico',
+ 'uploader_id': '100064391811349',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 136.179,
+ 'view_count': int,
+ 'concurrent_view_count': int,
+ },
}, {
# FIXME: https://github.com/yt-dlp/yt-dlp/issues/542
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
@@ -370,11 +400,16 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '117576630041613',
'ext': 'mp4',
- # TODO: title can be extracted from video page
- 'title': 'Facebook video #117576630041613',
- 'uploader_id': '189393014416438',
+ 'title': 'Officers Rescue Trapped Motorist from Mahoning River Crash 11-22-20',
+ 'thumbnail': r're:^https?://.*',
+ 'uploader': 'City of Alliance Police Department',
+ 'uploader_id': '100064413680392',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20201123',
'timestamp': 1606162592,
+ 'duration': 101.504,
+ 'view_count': int,
+ 'concurrent_view_count': int,
},
'skip': 'Requires logging in',
}, {
@@ -392,7 +427,24 @@ class FacebookIE(InfoExtractor):
}, {
# data.video.creation_story.attachments[].media
'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '1823658634322275',
+ 'ext': 'mp4',
+ 'title': 'Live Webcam from Corfu - Greece',
+ 'description': 'md5:84c1af6894ecffe710c79744e4873e85',
+ 'thumbnail': r're:^https?://.*',
+ 'uploader': 'SkylineWebcams',
+ 'uploader_id': '100064307154679',
+ 'uploader_url': r're:^https?://.*',
+ 'upload_date': '20180319',
+ 'timestamp': 1521449766,
+ 'duration': 14424.199,
+ 'view_count': int,
+ 'concurrent_view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'https://www.facebook.com/watchparty/211641140192478',
'info_dict': {
@@ -411,6 +463,7 @@ class FacebookIE(InfoExtractor):
'thumbnail': r're:^https?://.*',
'uploader': 'Comitato Liberi Pensatori',
'uploader_id': '100065709540881',
+ 'uploader_url': r're:^https?://.*',
},
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
@@ -445,7 +498,9 @@ def _perform_login(self, username, password):
try:
login_results = self._download_webpage(request, None,
note='Logging in', errnote='unable to fetch login page')
- if re.search(r'<form(.*)name="login"(.*)</form>', login_results):
error = self._html_search_regex(
r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
login_results, 'login error', default=None, group='error')
@@ -477,64 +532,396 @@ def _perform_login(self, username, password):
self.report_warning('unable to log in: %s' % error_to_compat_str(err))
return
+ def _get_video_metadata(self, url):
+ metadata = {}
+ ffmpeg = FFmpegPostProcessor()
+ if ffmpeg.probe_available:
+ for stream in traverse_obj(ffmpeg.get_metadata_object(url), 'streams', expected_type=list) or []:
+ if stream.get('codec_type') == 'video':
+ f, d = [int_or_none(x) for x in (
+ stream['avg_frame_rate'].split('/') if stream.get('avg_frame_rate') else [None, None])]
+ metadata.update(**traverse_obj(stream, {
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ 'vcodec': (('codec_tag_string', 'codec_name'), {str_or_none}),
+ 'vbr': ('bit_rate', {lambda x: float_or_none(x, 1000)}),
+ }, get_all=False), **{
+ 'fps': round(f / d, 1) if f and d else None,
+ })
+ elif stream.get('codec_type') == 'audio':
+ metadata.update(traverse_obj(stream, {
+ 'audio_channels': ('channels', {int_or_none}),
+ 'acodec': (('codec_tag_string', 'codec_name'), {str_or_none}),
+ 'asr': ('sample_rate', {int_or_none}),
+ 'abr': ('bit_rate', {lambda x: float_or_none(x, 1000)}),
+ }, get_all=False))
+ return metadata
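+ # Rough shape of the result, assuming ffprobe is available (all values hypothetical):
+ #   self._get_video_metadata('https://video.xx.fbcdn.net/v/example.mp4')
+ #   -> {'width': 1280, 'height': 720, 'vcodec': 'avc1', 'vbr': 1035.0, 'fps': 30.0,
+ #       'audio_channels': 2, 'acodec': 'mp4a', 'asr': 44100, 'abr': 128.0}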
+
def _extract_from_url(self, url, video_id):
webpage = self._download_webpage(
- url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
-
- def extract_metadata(webpage):
- post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
- r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)]
- post = traverse_obj(post_data, (
- ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
- media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
- k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
- title = get_first(media, ('title', 'text'))
- description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
- page_title = title or self._html_search_regex((
- r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
- r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
- self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
- ), webpage, 'title', default=None, group='content')
- description = description or self._html_search_meta(
- ['description', 'og:description', 'twitter:description'],
- webpage, 'description', default=None)
- uploader_data = (
- get_first(media, ('owner', {dict}))
- or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
- or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
- or get_first(post, ('node', 'actors', ..., {dict}))
- or get_first(post, ('event', 'event_creator', {dict})) or {})
- uploader = uploader_data.get('name') or (
- clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
- or self._search_regex(
- (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
- timestamp = int_or_none(self._search_regex(
- r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
- 'timestamp', default=None))
- thumbnail = self._html_search_meta(
- ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
- # some webpages contain unretrievable thumbnail urls
- # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
- # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
- if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
- thumbnail = None
- info_dict = {
- 'description': description,
- 'uploader': uploader,
- 'uploader_id': uploader_data.get('id'),
- 'timestamp': timestamp,
- 'thumbnail': thumbnail,
- 'view_count': parse_count(self._search_regex(
- (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
- webpage, 'view count', default=None)),
- 'concurrent_view_count': get_first(post, (
- ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
- }
+ re.sub(r'://(?:[\w-]+\.)?facebook\.com/', '://www.facebook.com/', url), video_id, tries=2)
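+ # the host rewrite maps variants such as m.facebook.com or web.facebook.com (examples)
+ # to www.facebook.com, whose markup this extractor targets; tries=2 retries once on
+ # transient network errors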
+
+ post_data = re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)
+
+ sjs_data = [self._parse_json(j, video_id, fatal=False) for j in post_data]
+ cookies = self._get_cookies(url)
+ # user passed logged-in cookies or attempted to login
+ login_data = cookies.get('c_user') and cookies.get('xs')
+ logged_in = False
+ if login_data:
+ logged_in = get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'define',
+ lambda _, v: 'CurrentUserInitialData' in v, ..., 'ACCOUNT_ID'), default='0') != '0'
+ if logged_in and (info := get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data',
+ (('ufac_client', 'state', (('set_contact_point_state_renderer', 'title'),
+ ('intro_state_renderer', 'header_title'))),
+ ('epsilon_checkpoint', 'screen', 'title'))
+ ))):
+ if any(content in info for content in ['days left to appeal', 'suspended your account']):
+ raise ExtractorError('Your account is suspended', expected=True)
+ if 'Enter mobile number' == info:
+ raise ExtractorError('Facebook is requiring mobile number confirmation', expected=True)
+ if 'your account has been locked' in info:
+ raise ExtractorError('Your account has been locked', expected=True)
+ if props := get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., 'rootView',
+ lambda _, v: v.get('title') is not None)):
+ if not self._cookies_passed:
+ self.raise_login_required()
+ else:
+ msg = join_nonempty('title', 'body', delim='. ', from_dict=props).replace('\n ', '')
+ raise ExtractorError(f'Facebook: {msg}', expected=True)
+
+ def find_json_obj(json_strings, *patterns, obj_in_value=False, get_all=False):
+ """
+ Find JSON object, in the form of a string, by regular expression
+ >>> obj = find_json_obj((json_a, _or_b), regex_a, (regex_b, _or_c), obj_in_value=True, get_all=True)
+ @param json_strings string, tuple regex patterns (match only one in tuple)
+ *patterns string, tuple regex patterns (match only one in tuple)
+ obj_in_value boolean False: find the object(s) containing the pattern(s)
+ True : given pattern(s) of the key(s) to find the
+ object(s) in the value of that key(s)
+ get_all boolean
+ @return list of tuple a list of (match, JSON object)
+ """
+ def find_offset(string, bracket):
+ _BRACKET_MAP = {
+ '{': ('}', (1 if obj_in_value else -1)), # (opposite bracket, search direction)
+ '}': ('{', 1), # search direction: 1 - forward, -1 - backward
+ }
+ count, b_sum, offset = 0, 0, 0
+ for x in string[::_BRACKET_MAP[bracket][1]]:
+ count += (1 if x == bracket or x == _BRACKET_MAP[bracket][0] else 0)
+ b_sum += (1 if x == bracket else (-1 if x == _BRACKET_MAP[bracket][0] else 0))
+ offset += 1
+ if count > 0 and b_sum >= (0 if obj_in_value else 1):
+ break
+ return offset * _BRACKET_MAP[bracket][1]
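+ # e.g. find_offset('"a":1}', '}') == 6, one character past the matching '}';
+ # the scan runs forward (or backward when locating an enclosing '{') until the
+ # bracket balance closes, so the caller can slice out a complete {...} object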
+ for json_str in (json_strings if isinstance(json_strings, tuple) else [json_strings]): # return 1st match
+ if isinstance(json_str, str):
+ for pattern_grp in (patterns if isinstance(patterns, tuple) else [patterns]): # match all
+ for pattern in (pattern_grp if isinstance(pattern_grp, tuple) else [pattern_grp]): # return 1st match
+ found = False
+ if isinstance(pattern, str):
+ for m in re.finditer(pattern, json_str): # depend on get_all
+ i = m.end(m.lastindex or 0) if obj_in_value else m.start(m.lastindex or 0)
+ opening = (i + find_offset(
+ json_str[(i * obj_in_value):(i * (not obj_in_value) - obj_in_value)], '{'
+ ) - obj_in_value)
+ closing = i + find_offset(json_str[i:], '}')
+ if isinstance(opening, int) and isinstance(closing, int):
+ found = True
+ yield (m.group(0), json_str[opening:closing:])
+ if not get_all:
+ break
+ else:
+ if found:
+ break
+ continue
+ break
+ else:
+ continue
+ else:
+ if found:
+ break
+ continue
+ break
+
+ page_description = self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'],
+ webpage, 'description', default=None)
+ page_title = self._html_search_regex((
+ r'<title>\s(?P<content>[\s\S]+?)\s</title>',
+ self._meta_regex('og:title'), self._meta_regex('twitter:title'),
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>'
+ ), re.sub(r'(Facebook(\sLive)?)|(Video)', '',
+ webpage), 'title', default='', group='content').split(' | ')[0]
+ thumbnail = self._html_search_meta(
+ ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+ thumbnail = None
+
+ data, uploader_info = [], {}
+ p_id = s_id = linked_url = description = title = timestamp = None
+ for p_data in post_data[:]:
+ if '"feed_unit":' not in p_data:
+ if (not s_id or not p_id) and ('"story":' in p_data or '"creation_story":' in p_data):
+ p_id = p_id if p_id else (lambda x: x.group(1) if x else
+ (video_id if video_id.isnumeric() else None)
+ )(re.search(r'"(?:post_id|videoId|video_id)":\s?"(\d+)"', p_data))
+ s_id = s_id if s_id else (lambda x: x.group(1) if x else None
+ )(re.search(r'"id":\s?"(Uzpf[^"]+)"', p_data))
+ if not data:
+ if '"dash_manifest_url":' in p_data:
+ for x in find_json_obj(p_data, r'"data":\s?{', r'"data":', obj_in_value=True):
+ if '"dash_manifest_url":' in x[1]:
+ data = x[1]
+ break
+ elif '"attachment":{"source":{' in p_data or '"attachment":{"web_link":{' in p_data:
+ # linked media
+ for x in find_json_obj(p_data, r'"data":\s?{', r'"data":', obj_in_value=True):
+ if '"attachment":{"source":{' in x[1] or '"attachment":{"web_link":{' in x[1]:
+ data = x[1]
+ break
+ for x in find_json_obj(
+ data, r'("attachment"):\s*{"source":', r'("attachment"):\s*{"web_link":',
+ obj_in_value=True
+ ):
+ if linked_url := traverse_obj(
+ json.loads(x[1]), (('web_link', None), 'url', {url_or_none}), get_all=False):
+ break
+ if not re.search(r'"(?:dash_manifest_url|title|message)":', p_data):
+ post_data.remove(p_data)
+ else:
+ post_data.remove(p_data)
+ if data and s_id and p_id:
+ break
+
+ post_data = ','.join(post_data)
+ # uploader
+ for x in find_json_obj(post_data, (
+ r'"actors":[^}]*"__isActor":', r'"owner":[^}]*"name":\s?"[^"]'), get_all=True):
+ if f'"id":"{s_id}"' in x[1] and '"name":"' in x[1]:
+ uploader_info = traverse_obj(json.loads(x[1]), {
+ 'uploader': (('owner', ('actors', ...)), 'name', {str}),
+ 'uploader_id': (('owner', ('actors', ...)), 'id', {str}),
+ 'uploader_url': (('owner', ('actors', ...)), 'url', {str}),
+ }, get_all=False)
+ if not uploader_info.get('uploader_url') and uploader_info.get('uploader_id'):
+ uploader_info['uploader_url'] = (
+ f"https://www.facebook.com/profile.php?id={uploader_info['uploader_id']}")
+ break
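+ # e.g. uploader_info ends up as (values hypothetical):
+ #   {'uploader': 'Page Name', 'uploader_id': '100012345678901',
+ #    'uploader_url': 'https://www.facebook.com/profile.php?id=100012345678901'}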
+ # description
+ for x in find_json_obj(post_data, (
+ r',"message":(?:(?!"message":)[^}])*"text":\s?"[^"](?:(?!"id":).)*"id":',
+ r'"message":(?:(?!"message":)[^}])*"text":\s?"[^"](?:(?!"id":).)*"id":'
+ ), get_all=True):
+ x_dict = json.loads(x[1])
+ for i in [i for i in [s_id, p_id] if i is not None]:
+ if x_dict.get('id') == i:
+ if (description := x_dict['message'] if isinstance(x_dict['message'], str)
+ else (x_dict['message'] or {}).get('text')):
+ if (track_title := (lambda x: x.group(0) if x else None
+ )(re.search(r'"track_title":\s?"[^"]+"', x[1]))):
+ description += '. ' + json.loads('{' + track_title + '}')['track_title']
+ break
+ if description:
+ break
+ # title / description
+ for x in find_json_obj(post_data, r'"title":(?:(?!"title":).)*"text":\s?"[^"]', get_all=True):
+ x_dict = json.loads(x[1])
+ if p_id:
+ if (text := x_dict['title'] if isinstance(x_dict['title'], str)
+ else (x_dict['title'] or {}).get('text')):
+ title = title if title else (text if x_dict.get('id') == p_id else title)
+ page_description = (page_description if page_description
+ else (text if x_dict.get('id') == s_id else page_description))
+ if title and page_description:
+ break
+ title = (lambda x: x if x != uploader_info.get('uploader') else None
+ )(title or page_title or re.sub(r'(\s*\n\s*)', ' ', (description or '')))
+ title = title if len(title or '') <= 100 else title[:(47 + title[47:67].rfind(' '))] + '...'
+ # timestamp
+ for x in find_json_obj(
+ post_data, r'"publish_time":\d+,', r'"creation_time":\d+,', get_all=True
+ ):
+ if f'"id":"{p_id}"' in x[1] or f'"id":"{s_id}"' in x[1]:
+ if timestamp := json.loads(x[1]).get(x[0].split('"')[1]):
+ break
+
+ webpage_info = {
+ 'title': title,
+ 'description': description or page_description,
+ 'thumbnails': [{k: v for k, v in {
+ 'url': thumbnail,
+ 'height': (lambda x: int_or_none(x.group(1)) if x else None
+ )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', thumbnail))
+ }.items() if v is not None}] if url_or_none(thumbnail) else [],
+ 'timestamp': timestamp,
+ 'view_count': parse_count(self._search_regex(
+ (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
+ webpage, 'view count', default=None)),
+ }
+
+ if linked_url:
+ return self.url_result(linked_url, video_id=video_id, url_transparent=True,
+ **{k: v for k, v in merge_dicts(webpage_info, uploader_info).items() if v})
+
+ def extract_dash_manifest(video, formats):
+ dash_manifest = video.get('dash_manifest')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
+ mpd_url=video.get('dash_manifest_url')))
- info_json_ld = self._search_json_ld(webpage, video_id, default={})
- info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
- or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
- return merge_dicts(info_json_ld, info_dict)
+ def process_formats(info):
+ # Downloads with browser's User-Agent are rate limited. Working around
+ # with non-browser User-Agent.
+ for f in info['formats']:
+ f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+ # Formats larger than ~500MB will return error 403 unless chunk size is regulated
+ f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
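+ # (250 << 20 == 262144000 bytes, i.e. 250 MiB per request, which stays safely
+ # below the ~500MB threshold mentioned above)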
+
+ if data:
+ entries = []
+
+ def parse_graphql_video(video):
+ v_id = video.get('videoId') or video.get('id') or video_id
+ formats = []
+ for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
+ ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
+ ('browser_native_sd_url', 'sd')):
+ playable_url = video.get(key)
+ if not playable_url:
+ continue
+ if determine_ext(playable_url) == 'mpd':
+ formats.extend(self._extract_mpd_formats(playable_url, video_id))
+ else:
+ metadata = self._get_video_metadata(playable_url)
+ if metadata:
+ formats.append({**{
+ 'format_id': format_id,
+ 'url': playable_url,
+ }, **metadata})
+ else:
+ q = qualities(['sd', 'hd'])
+ formats.append({
+ 'format_id': format_id,
+ # sd, hd formats w/o resolution info should be deprioritized below DASH
+ 'quality': q(format_id) - 3,
+ 'url': playable_url,
+ })
+ extract_dash_manifest(video, formats)
+
+ # captions/subtitles
+ automatic_captions, subtitles = {}, {}
+ is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
+ for caption in traverse_obj(video, (
+ 'video_available_captions_locales',
+ {lambda x: sorted(x, key=lambda c: c['locale'])},
+ lambda _, v: url_or_none(v['captions_url'])
+ )):
+ lang = caption.get('localized_language') or 'und'
+ subs = {
+ 'url': caption['captions_url'],
+ 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
+ }
+ if caption.get('localized_creation_method') or is_broadcast:
+ automatic_captions.setdefault(caption['locale'], []).append(subs)
+ else:
+ subtitles.setdefault(caption['locale'], []).append(subs)
+ captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
+ if captions_url and not automatic_captions and not subtitles:
+ locale = self._html_search_meta(
+ ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
+ (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
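+ # a caption node looks roughly like (values hypothetical):
+ #   {'locale': 'en_US', 'localized_language': 'English', 'localized_country': 'US',
+ #    'localized_creation_method': 'auto-generated', 'captions_url': 'https://...'}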
+ # thumbnails
+ thumbnails = []
+ for t_url in [uri for uri in [traverse_obj(video, path) for path in [
+ ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri'),
+ ('image', 'uri'), ('previewImage', 'uri')
+ ]] if url_or_none(uri) is not None]:  # t_url avoids shadowing the url argument
+ if (re.search(r'\.(?:jpg|png)', t_url)
+ and not any(t_url.split('_cat=')[0] in t['url'] for t in thumbnails)):
+ thumbnails.append({k: v for k, v in {
+ 'url': t_url,
+ 'height': (lambda x: int_or_none(x.group(1)) if x else None
+ )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', t_url))
+ }.items() if v is not None})
+ # uploader
+ if uploader_id := traverse_obj(video, ('owner', 'id', {str_or_none})):
+ if x := list(find_json_obj(post_data, (
+ r'"id":\s?"%s"[^}]*"name":\s?"[^"]' % uploader_id,
+ r'"name":\s?"[^"][^}]*"id":\s?"%s"' % uploader_id))):
+ if x[0][1]:
+ video['owner'] = merge_dicts(video['owner'], json.loads(x[0][1]))
+ elif x := list(find_json_obj(data, (
+ r'("video_owner":)[^}]*"name":\s?"[^"]', r'([_"]owner":)[^}]*"name":\s?"[^"]',
+ r'(_creator":)[^}]*"name":\s?"[^"]', r'("actor":)[^}]*"name":\s?"[^"]'
+ ), obj_in_value=True)):
+ if x[0][1]:
+ video['owner'] = json.loads(x[0][1])
+ uploader = try_get(video, lambda x: x['owner']['name'])
+ # title
+ if v_name := video.get('name'):
+ v_title = v_name if len(v_name) <= 100 else v_name[:(47 + v_name[47:67].rfind(' '))] + '...'
+
+ info = {
+ 'id': v_id,
+ 'title': ((v_title if video.get('name') else None)
+ or (f"{webpage_info['title']} #{v_id}" if webpage_info['title']
+ else (f"{uploader}'s Video #{v_id}" if uploader else f'Facebook Video #{v_id}'))),
+ 'description': (try_get(video, lambda x: x['savable_description']['text'])
+ or video.get('name') or webpage_info['description']),
+ 'thumbnails': thumbnails,
+ 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
+ 'uploader': uploader,
+ 'uploader_id': try_get(video, lambda x: x['owner']['id']),
+ 'uploader_url': (try_get(video, lambda x: x['owner']['url'])
+ or (lambda x: f'https://www.facebook.com/profile.php?id={x}' if x else None
+ )(try_get(video, lambda x: x['owner']['id']))),
+ 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
+ or float_or_none(video.get('length_in_second'))),
+ 'formats': formats,
+ 'automatic_captions': automatic_captions,
+ 'subtitles': subtitles,
+ 'concurrent_view_count': video.get('liveViewerCount'),
+ }
+ process_formats(info)
+ entries.append(info)
+
+ video_ids = []
+ for idx, x in enumerate(find_json_obj(
+ data, (r'"dash_manifest_url":\s?"', r'_hd_url":\s?"', r'_sd_url":\s?"'), get_all=True
+ )):
+ media = json.loads(x[1])
+ if (media.get('__typename', 'Video') == 'Video'
+ and not media.get('sticker_image')
+ and media.get('id', f'{video_id}_{idx}') not in video_ids):
+ video_ids.append(media.get('id', f'{video_id}_{idx}'))
+ parse_graphql_video(media)
+ if media.get('id') == video_id:
+ break
+
+ if len(entries) > 1:
+ return self.playlist_result(entries, video_id, **{
+ k: v for k, v in merge_dicts(webpage_info, uploader_info).items() if v})
+
+ video_info = entries[0] if entries else {'id': video_id}
+ if "'s Video #" in video_info.get('title', ''):
+ video_info['title'] = f"{video_info['uploader']}'s Video"
+ if video_info.get('timestamp'):
+ webpage_info['timestamp'] = None
+ if webpage_info['thumbnails'] and video_info.get('thumbnails'):
+ if not any(webpage_info['thumbnails'][0]['url'].split('_cat=')[0] in thumbnail['url']
+ for thumbnail in video_info['thumbnails']):
+ video_info['thumbnails'].extend(webpage_info['thumbnails'])
+ webpage_info['thumbnails'] = video_info['thumbnails']
+ return merge_dicts(webpage_info, video_info)
video_data = None
@@ -559,34 +946,6 @@ def extract_from_jsmods_instances(js_data):
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
- def extract_dash_manifest(video, formats):
- dash_manifest = video.get('dash_manifest')
- if dash_manifest:
- formats.extend(self._parse_mpd_formats(
- compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
- mpd_url=video.get('dash_manifest_url')))
-
- def process_formats(info):
- # Downloads with browser's User-Agent are rate limited. Working around
- # with non-browser User-Agent.
- for f in info['formats']:
- f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
- # Formats larger than ~500MB will return error 403 unless chunk size is regulated
- f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
-
- def extract_relay_data(_filter):
- return self._parse_json(self._search_regex(
- r'data-sjs>({.*?%s.*?})' % _filter,
- webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
-
- def extract_relay_prefetched_data(_filter):
- return traverse_obj(extract_relay_data(_filter), (
- 'require', (None, (..., ..., ..., '__bbox', 'require')),
- lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
- ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
-
if not video_data:
server_js_data = self._parse_json(self._search_regex([
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
@@ -594,131 +953,6 @@ def extract_relay_prefetched_data(_filter):
], webpage, 'js data', default='{}'), video_id, js_to_json, False)
video_data = extract_from_jsmods_instances(server_js_data)
- if not video_data:
- data = extract_relay_prefetched_data(
- r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
- if data:
- entries = []
-
- def parse_graphql_video(video):
- v_id = video.get('videoId') or video.get('id') or video_id
- reel_info = traverse_obj(
- video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
- if reel_info:
- video = video['creation_story']
- video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
- video.update(reel_info)
- formats = []
- q = qualities(['sd', 'hd'])
- for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
- ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
- ('browser_native_sd_url', 'sd')):
- playable_url = video.get(key)
- if not playable_url:
- continue
- if determine_ext(playable_url) == 'mpd':
- formats.extend(self._extract_mpd_formats(playable_url, video_id))
- else:
- formats.append({
- 'format_id': format_id,
- # sd, hd formats w/o resolution info should be deprioritized below DASH
- 'quality': q(format_id) - 3,
- 'url': playable_url,
- })
- extract_dash_manifest(video, formats)
-
- automatic_captions, subtitles = {}, {}
- is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
- for caption in traverse_obj(video, (
- 'video_available_captions_locales',
- {lambda x: sorted(x, key=lambda c: c['locale'])},
- lambda _, v: url_or_none(v['captions_url'])
- )):
- lang = caption.get('localized_language') or 'und'
- subs = {
- 'url': caption['captions_url'],
- 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
- }
- if caption.get('localized_creation_method') or is_broadcast:
- automatic_captions.setdefault(caption['locale'], []).append(subs)
- else:
- subtitles.setdefault(caption['locale'], []).append(subs)
- captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
- if captions_url and not automatic_captions and not subtitles:
- locale = self._html_search_meta(
- ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
- (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
-
- info = {
- 'id': v_id,
- 'formats': formats,
- 'thumbnail': traverse_obj(
- video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
- 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})),
- 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
- 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
- or float_or_none(video.get('length_in_second'))),
- 'automatic_captions': automatic_captions,
- 'subtitles': subtitles,
- }
- process_formats(info)
- description = try_get(video, lambda x: x['savable_description']['text'])
- title = video.get('name')
- if title:
- info.update({
- 'title': title,
- 'description': description,
- })
- else:
- info['title'] = description or 'Facebook video #%s' % v_id
- entries.append(info)
-
- def parse_attachment(attachment, key='media'):
- media = attachment.get(key) or {}
- if media.get('__typename') == 'Video':
- return parse_graphql_video(media)
-
- nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
- attachments = traverse_obj(nodes, (
- ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
- ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
- 'attachment', {dict}))
- for attachment in attachments:
- ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
- ('target', 'attachments', ..., 'styles', 'attachment', {dict}))
- for n in ns:
- parse_attachment(n)
- parse_attachment(attachment)
-
- edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
- for edge in edges:
- parse_attachment(edge, key='node')
-
- video = traverse_obj(data, (
- 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {}
- if video:
- attachments = try_get(video, [
- lambda x: x['story']['attachments'],
- lambda x: x['creation_story']['attachments']
- ], list) or []
- for attachment in attachments:
- parse_attachment(attachment)
- if not entries:
- parse_graphql_video(video)
-
- if len(entries) > 1:
- return self.playlist_result(entries, video_id)
-
- video_info = entries[0] if entries else {'id': video_id}
- webpage_info = extract_metadata(webpage)
- # honor precise duration in video info
- if video_info.get('duration'):
- webpage_info['duration'] = video_info['duration']
- # preserve preferred_thumbnail in video info
- if video_info.get('thumbnail'):
- webpage_info['thumbnail'] = video_info['thumbnail']
- return merge_dicts(webpage_info, video_info)
-
if not video_data:
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">(.*?)</div>', webpage)
if m_msg is not None:
@@ -731,6 +965,17 @@ def parse_attachment(attachment, key='media'):
'id="loginbutton"')):
self.raise_login_required()
+ def extract_relay_data(_filter):
+ return self._parse_json(self._search_regex(
+ r'data-sjs>({.*?%s.*?})' % _filter,
+ webpage, 'relay data', default='{}'), video_id, fatal=False) or {}
+
+ def extract_relay_prefetched_data(_filter):
+ return traverse_obj(extract_relay_data(_filter), (
+ 'require', (None, (..., ..., ..., '__bbox', 'require')),
+ lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
+ ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
+
if not video_data and '/watchparty/' in url:
post_data = {
'doc_id': 3731964053542869,
@@ -772,6 +1017,8 @@ def parse_attachment(attachment, key='media'):
# tahoe player specific URL
tahoe_data = self._download_webpage(
self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
+ errnote='Unable to download alternative webpage',
+ fatal=False,
data=urlencode_postdata({
'__a': 1,
'__pc': self._search_regex(
@@ -787,15 +1034,20 @@ def parse_attachment(attachment, key='media'):
headers={
'Content-Type': 'application/x-www-form-urlencoded',
})
- tahoe_js_data = self._parse_json(
- self._search_regex(
- r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
- 'tahoe js data', default='{}'),
- video_id, fatal=False)
- video_data = extract_from_jsmods_instances(tahoe_js_data)
+ if tahoe_data:
+ tahoe_js_data = self._parse_json(
+ self._search_regex(
+ r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
+ 'tahoe js data', default='{}'),
+ video_id, fatal=False)
+ video_data = extract_from_jsmods_instances(tahoe_js_data)
if not video_data:
- raise ExtractorError('Cannot parse data')
+ if not login_data:
+ self.raise_login_required('No video formats found')
+ if not logged_in:
+ self.raise_login_required('Failed to log in with provided data')
+ self.raise_no_formats('No video formats found!')
if len(video_data) > 1:
entries = []
@@ -820,17 +1072,24 @@ def parse_attachment(attachment, key='media'):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
- # sd, hd formats w/o resolution info should be deprioritized below DASH
- # TODO: investigate if progressive or src formats still exist
- preference = -10 if format_id == 'progressive' else -3
- if quality == 'hd':
- preference += 1
- formats.append({
- 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
- 'url': src,
- 'quality': preference,
- 'height': 720 if quality == 'hd' else None
- })
+ metadata = self._get_video_metadata(src)
+ if metadata:
+ formats.append({**{
+ 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+ 'url': src,
+ }, **metadata})
+ else:
+ # sd, hd formats w/o resolution info should be deprioritized below DASH
+ # TODO: investigate if progressive or src formats still exist
+ preference = -10 if format_id == 'progressive' else -3
+ if quality == 'hd':
+ preference += 1
+ formats.append({
+ 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+ 'url': src,
+ 'quality': preference,
+ 'height': 720 if quality == 'hd' else None
+ })
extract_dash_manifest(f[0], formats)
subtitles_src = f[0].get('subtitles_src')
if subtitles_src:
@@ -842,7 +1101,7 @@ def parse_attachment(attachment, key='media'):
'subtitles': subtitles,
}
process_formats(info_dict)
- info_dict.update(extract_metadata(webpage))
+ info_dict.update(webpage_info)
return info_dict
@@ -858,15 +1117,21 @@ class FacebookPluginsVideoIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560',
- 'md5': '5954e92cdfe51fe5782ae9bda7058a07',
+ 'md5': 'ce2c32ab234398f4645389326133bce8',
'info_dict': {
'id': '10154383743583686',
'ext': 'mp4',
- # TODO: Fix title, uploader
'title': 'What to do during the haze?',
+ 'description': 'md5:81839c0979803a014b20798df255ed0b',
+ 'thumbnail': r're:^https?://.*',
'uploader': 'Gov.sg',
+ 'uploader_id': '100064718678925',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20160826',
'timestamp': 1472184808,
+ 'duration': 65.087,
+ 'view_count': int,
+ 'concurrent_view_count': int,
},
'add_ie': [FacebookIE.ie_key()],
}, {
@@ -897,7 +1162,7 @@ class FacebookRedirectURLIE(InfoExtractor):
'playable_in_embed': True,
'categories': ['Music'],
'channel': 'Boiler Room',
- 'uploader_id': 'brtvofficial',
+ 'uploader_id': '@boilerroom',
'uploader': 'Boiler Room',
'tags': 'count:11',
'duration': 3332,
@@ -905,11 +1170,15 @@ class FacebookRedirectURLIE(InfoExtractor):
'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg',
'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg',
'availability': 'public',
- 'uploader_url': 'http://www.youtube.com/user/brtvofficial',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20150917',
'age_limit': 0,
'view_count': int,
'like_count': int,
+ 'heatmap': 'count:100',
+ 'channel_is_verified': True,
+ 'channel_follower_count': int,
+ 'comment_count': int,
},
'add_ie': ['Youtube'],
'params': {'skip_download': 'Youtube'},
@@ -932,10 +1201,11 @@ class FacebookReelIE(InfoExtractor):
'info_dict': {
'id': '1195289147628387',
'ext': 'mp4',
- 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e',
- 'description': 'md5:22f03309b216ac84720183961441d8db',
- 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1',
+ 'title': 'md5:32aab9976c6b8a145fc0d799631e2b74',
+ 'description': 'md5:3ea795c5ebb7ed28e3e78bb7b1191753',
+ 'uploader': 'Beast Camp Training',
'uploader_id': '100040874179269',
+ 'uploader_url': r're:^https?://.*',
'duration': 9.579,
'timestamp': 1637502609,
'upload_date': '20211121',
@@ -949,7 +1219,7 @@ def _real_extract(self, url):
f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)
-class FacebookAdsIE(InfoExtractor):
+class FacebookAdsIE(FacebookIE):
_VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P\d+)'
IE_NAME = 'facebook:ads'
@@ -1000,13 +1270,13 @@ def _extract_formats(self, video_dict):
for format_key, format_url in traverse_obj(video_dict, (
{dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1])
)):
- formats.append({
+ formats.append({**{
'format_id': self._FORMATS_MAP[format_key][0],
'format_note': self._FORMATS_MAP[format_key][1],
'url': format_url,
'ext': 'mp4',
'quality': qualities(tuple(self._FORMATS_MAP))(format_key),
- })
+ }, **self._get_video_metadata(format_url)})
return formats
def _real_extract(self, url):