From 6d1cd26f42fec846ec418ed02fa7e1c992c3e7b9 Mon Sep 17 00:00:00 2001
From: kclauhk <78251477+kclauhk@users.noreply.github.com>
Date: Mon, 18 Nov 2024 09:16:24 +0800
Subject: [PATCH] [ie/facebook] Experimental
- extract data from JSON strings by searching with regex instead of traversing dicts
- extract linked video
- improved thumbnail info
- more lenient url pattern
- live status
Known issue: videos in comments might be included
---
yt_dlp/extractor/facebook.py | 1235 ++++++++++++++++++++--------------
1 file changed, 726 insertions(+), 509 deletions(-)
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index c07efcd5811a..90bd674586c1 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -12,7 +12,6 @@
determine_ext,
float_or_none,
format_field,
- get_element_by_id,
get_first,
int_or_none,
join_nonempty,
@@ -39,19 +38,12 @@ class FacebookIE(InfoExtractor):
(?:[^#]*?\#!/)?
(?:
(?:
- permalink\.php|
- video/video\.php|
- photo\.php|
- video\.php|
- video/embed|
- story\.php|
- watch(?:/live)?/?
+ (?:video/)?[a-z]{5,}(?:\.php|/live)?/?
)\?(?:.*?)(?:v|video_id|story_fbid)=|
- [^/]+/videos/(?:[^/]+/)?|
- [^/]+/posts/|
+ [^/]+/(?:videos|posts)/(?:[^/]+/)?|
events/(?:[^/]+/)?|
groups/[^/]+/(?:permalink|posts)/|
- watchparty/
+ [a-z]{5,}/|
)|
facebook:
)
@@ -73,179 +65,132 @@ class FacebookIE(InfoExtractor):
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
_TESTS = [{
+ # data.video.story.attachments[].media
'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
'info_dict': {
'id': '3676516585958356',
'ext': 'mp4',
'title': 'dr Adam Przygoda',
'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
- 'uploader': 'RADIO KICKS FM',
- 'upload_date': '20230818',
- 'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
+ 'timestamp': 1692346159,
+ 'upload_date': '20230818',
+ 'uploader': 'RADIO KICKS FM',
'uploader_id': '100063551323670',
+ 'uploader_url': r're:^https?://.*',
'duration': 3133.583,
+ 'live_status': 'was_live',
+ 'concurrent_view_count': int,
'view_count': int,
- 'concurrent_view_count': 0,
+ 'like_count': int,
+ 'comment_count': int,
},
}, {
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
- 'md5': '6a40d33c0eccbb1af76cf0485a052659',
- 'info_dict': {
- 'id': '637842556329505',
- 'ext': 'mp4',
- 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
- 'uploader': 'Tennis on Facebook',
- 'upload_date': '20140908',
- 'timestamp': 1410199200,
- },
- 'skip': 'Requires logging in',
+ 'only_matching': True,
}, {
- # data.video
+ # data.video.story.attachments[].media
'url': 'https://www.facebook.com/video.php?v=274175099429670',
'info_dict': {
'id': '274175099429670',
'ext': 'mp4',
'title': 'Asif',
'description': '',
- 'uploader': 'Asif Nawab Butt',
- 'upload_date': '20140506',
- 'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
- 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl',
+ 'timestamp': 1399398998,
+ 'upload_date': '20140506',
+ 'uploader': 'Asif Nawab Butt',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'duration': 131.03,
+ 'live_status': 'not_live',
'concurrent_view_count': int,
'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
}, {
- 'note': 'Video with DASH manifest',
- 'url': 'https://www.facebook.com/video.php?v=957955867617029',
- 'md5': 'b2c28d528273b323abe5c6ab59f0f030',
- 'info_dict': {
- 'id': '957955867617029',
- 'ext': 'mp4',
- 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...',
- 'uploader': 'Demy de Zeeuw',
- 'upload_date': '20160110',
- 'timestamp': 1452431627,
- },
- 'skip': 'Requires logging in',
- }, {
- 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
- 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
- 'info_dict': {
- 'id': '544765982287235',
- 'ext': 'mp4',
- 'title': '"What are you doing running in the snow?"',
- 'uploader': 'FailArmy',
- },
- 'skip': 'Video gone',
- }, {
- 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
- 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
- 'info_dict': {
- 'id': '1035862816472149',
- 'ext': 'mp4',
- 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog',
- 'uploader': 'S. Saint',
- },
- 'skip': 'Video gone',
- }, {
- 'note': 'swf params escaped',
- 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
- 'md5': '97ba073838964d12c70566e0085c2b91',
- 'info_dict': {
- 'id': '10153664894881749',
- 'ext': 'mp4',
- 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...',
- 'thumbnail': r're:^https?://.*',
- 'timestamp': 1456259628,
- 'upload_date': '20160223',
- 'uploader': 'Barack Obama',
- },
- 'skip': 'Gif on giphy.com gone',
- }, {
- # have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
- 'md5': '1659aa21fb3dd1585874f668e81a72c8',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
'title': 'Holocaust survivor becomes US citizen',
'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f',
+ 'thumbnail': r're:^https?://.*',
'timestamp': 1477818095,
'upload_date': '20161030',
'uploader': 'CNN',
- 'thumbnail': r're:^https?://.*',
- 'view_count': int,
'uploader_id': '100059479812265',
- 'concurrent_view_count': int,
+ 'uploader_url': r're:^https?://.*',
'duration': 44.181,
- },
- }, {
- # FIXME: unable to extract uploader, no formats found
- # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
- # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
- 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
- 'info_dict': {
- 'id': '1417995061575415',
- 'ext': 'mp4',
- 'title': 'Довгоочікуване відео | By Yaroslav - Facebook',
- 'description': 'Довгоочікуване відео',
- 'timestamp': 1486648217,
- 'upload_date': '20170209',
- 'uploader': 'Yaroslav Korpan',
- 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
+ 'live_status': 'not_live',
'concurrent_view_count': int,
- 'thumbnail': r're:^https?://.*',
'view_count': int,
- 'duration': 11736.446,
- },
- 'params': {
- 'skip_download': True,
+ 'like_count': int,
+ 'comment_count': int,
},
}, {
- # FIXME: Cannot parse data error
+ # web_link url: http://giphy.com/gifs/l3vR8BKU0m8uX2mAg
'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
'info_dict': {
- 'id': '1072691702860471',
+ 'id': 'l3vR8BKU0m8uX2mAg',
'ext': 'mp4',
- 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
+ 'title': 'Nada mas satisfactorio que los otros 5... - La Guía Del Varón',
+ 'description': 'Nada mas satisfactorio que los otros 5 minutos',
+ 'tags': ['giphyupload'],
+ 'thumbnail': r're:^https?://.*',
'timestamp': 1477305000,
- 'upload_date': '20161024',
+ 'upload_date': '20161022',
'uploader': 'La Guía Del Varón',
- 'thumbnail': r're:^https?://.*',
+ 'uploader_id': '100050567346031',
+ 'uploader_url': r're:^https?://.*',
+ 'like_count': int,
+ 'comment_count': int,
},
- 'skip': 'Requires logging in',
+ 'skip': 'Gif on giphy.com',
}, {
- # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
+ # data.node.comet_sections.content.story.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': {
'id': '202882990186699',
'ext': 'mp4',
- 'title': 'birb (O v O") | Hello? Yes your uber ride is here',
- 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...',
- 'timestamp': 1486035513,
+ 'title': 'birb (O v O")',
+ 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1486035494,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
- 'uploader_id': '100013949973717',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 23.714,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
- 'skip': 'Requires logging in',
}, {
- # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
+ # web_link url: https://www.facebook.com/attn/videos/1569199726448814/
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
'info_dict': {
'id': '1569199726448814',
'ext': 'mp4',
- 'title': 'Pence MUST GO!',
- 'description': 'Vickie Gentry shared a memory.',
+ 'title': 'What if marijuana companies were allowed to have TV ads like B...',
+ 'description': 'What if we treated marijuana ads like big pharma ads?',
+ 'thumbnail': r're:^https?://.*',
'timestamp': 1511548260,
'upload_date': '20171124',
- 'uploader': 'Vickie Gentry',
- 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l',
- 'thumbnail': r're:^https?://.*',
+ 'uploader': 'ATTN:',
+ 'uploader_id': '100064451419378',
+ 'uploader_url': r're:^https?://.*',
'duration': 148.224,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
@@ -253,15 +198,21 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '6968553779868435',
'ext': 'mp4',
+ 'title': 'ATTN: - Learning new problem-solving skills is hard for...',
'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb',
- 'uploader': 'ATTN:',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1701975646,
'upload_date': '20231207',
- 'title': 'ATTN:',
- 'duration': 132.675,
+ 'uploader': 'ATTN:',
'uploader_id': '100064451419378',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 132.675,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
'view_count': int,
- 'thumbnail': r're:^https?://.*',
- 'timestamp': 1701975646,
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
@@ -269,147 +220,226 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '270103405756416',
'ext': 'mp4',
- 'title': 'Lela Evans',
- 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
+ 'title': 'Lela Evans - Today Makkovik\'s own Pilot Mandy Smith made...',
+ 'description': 'md5:cc93a91feb89923303c1f78656791e4d',
'thumbnail': r're:^https?://.*',
- 'uploader': 'Lela Evans',
- 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl',
- 'upload_date': '20231228',
'timestamp': 1703804085,
+ 'upload_date': '20231228',
+ 'uploader': 'Lela Evans',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
'duration': 394.347,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552',
'only_matching': True,
- }, {
- 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
- 'only_matching': True,
}, {
'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
'only_matching': True,
}, {
- # data.mediaset.currMedia.edges
+ # data.mediaset.currMedia.edges[].node
'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '10153870694020942',
+ 'ext': 'mp4',
+ 'title': 'Premier\'s Playoff Challenge',
+ 'description': 'md5:079134a18ac00b11ec5815fccf75a5a8',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1429133517,
+ 'upload_date': '20150415',
+ 'uploader': 'Christy Clark',
+ 'uploader_id': '100045032167189',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 31.197,
+ 'live_status': 'not_live',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
}, {
# data.video.story.attachments[].media
'url': 'facebook:544765982287235',
'only_matching': True,
}, {
- # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
- 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
- 'only_matching': True,
- }, {
- # data.video.creation_story.attachments[].media
'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
'only_matching': True,
}, {
- # data.video
'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670',
'only_matching': True,
}, {
- # no title
- 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
- 'only_matching': True,
- }, {
- # data.video
+ # data.video.story.attachments[].media
'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
'info_dict': {
'id': '359649331226507',
'ext': 'mp4',
'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1',
'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
+ 'thumbnail': r're:^https?://.*',
'timestamp': 1527084179,
'upload_date': '20180523',
'uploader': 'ESL One Dota 2',
'uploader_id': '100066514874195',
+ 'uploader_url': r're:^https?://.*',
'duration': 4524.001,
- 'view_count': int,
- 'thumbnail': r're:^https?://.*',
+ 'live_status': 'not_live',
'concurrent_view_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
'params': {
'skip_download': True,
},
}, {
- # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+ # data.video.story.attachments[].media
'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
'info_dict': {
'id': '106560053808006',
'ext': 'mp4',
'title': 'Josef',
+ 'description': '',
'thumbnail': r're:^https?://.*',
- 'concurrent_view_count': int,
- 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl',
'timestamp': 1549275572,
- 'duration': 3.283,
- 'uploader': 'Josef Novak',
- 'description': '',
'upload_date': '20190204',
+ 'uploader': 'Josef Novak',
+ 'uploader_id': r're:^pfbid.*',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 3.283,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
+ 'comment_count': int,
},
}, {
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/watch/?v=647537299265662',
- 'only_matching': True,
- }, {
- # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542
- # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
- 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
'info_dict': {
- 'id': '10157667649866271',
+ 'id': '647537299265662',
+ 'ext': 'mp4',
+ 'title': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su...',
+ 'description': 'Padre ense\u00f1a a su hijo a c\u00f3mo ba\u00f1ar un reci\u00e9n nacido junto con su gato y se hace viral, mir\u00e1 el video 😍',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1605534618,
+ 'upload_date': '20201116',
+ 'uploader': 'InfoPico',
+ 'uploader_id': '100064391811349',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 136.179,
+ 'live_status': 'not_live',
+ 'concurrent_view_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
- 'playlist_count': 3,
- 'skip': 'Requires logging in',
}, {
- # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
+ # data.video.story.attachments[].media
'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
'info_dict': {
'id': '117576630041613',
'ext': 'mp4',
- # TODO: title can be extracted from video page
- 'title': 'Facebook video #117576630041613',
- 'uploader_id': '189393014416438',
- 'upload_date': '20201123',
+ 'title': 'Officers Rescue Trapped Motorist from Mahoning River Crash 11-22-20',
+ 'thumbnail': r're:^https?://.*',
'timestamp': 1606162592,
+ 'upload_date': '20201123',
+ 'uploader': 'City of Alliance Police Department',
+ 'uploader_id': '100064413680392',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 101.504,
+ 'concurrent_view_count': int,
+ 'view_count': int,
+ 'comment_count': int,
},
'skip': 'Requires logging in',
}, {
- # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
+ 'only_matching': True,
+ }, {
+ # data.video.story.attachments[].media
+ 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
'info_dict': {
- 'id': '211567722618337',
+ 'id': '1823658634322275',
'ext': 'mp4',
- 'title': 'Facebook video #211567722618337',
- 'uploader_id': '127875227654254',
- 'upload_date': '20161122',
- 'timestamp': 1479793574,
+ 'title': 'Live Webcam from Corfu - Greece',
+ 'description': 'md5:84c1af6894ecffe710c79744e4873e85',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1521449766,
+ 'upload_date': '20180319',
+ 'uploader': 'SkylineWebcams',
+ 'uploader_id': '100064307154679',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 14424.199,
+ 'live_status': 'was_live',
+ 'concurrent_view_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'skip': 'No video',
}, {
- # data.video.creation_story.attachments[].media
- 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
- 'only_matching': True,
+ # data.node.comet_sections.content.story.attachments[].style.attachment.all_subattachments.nodes[].media.video_grid_renderer.video
+ 'url': 'https://www.facebook.com/story.php?story_fbid=5268096689957022&id=100002702286715',
+ 'info_dict': {
+ 'id': '669977824610306',
+ 'ext': 'mp4',
+ 'title': 'md5:f2666feb05057a09f8b6f542cd7a3eda',
+ 'description': 'md5:f5775e7245153857caade33e757ceb21',
+ 'thumbnail': r're:^https?://.*',
+ 'timestamp': 1671780203,
+ 'upload_date': '20221223',
+ 'uploader': 'Azura Tan Siow Ling',
+ 'uploader_id': '100002702286715',
+ 'uploader_url': r're:^https?://.*',
+ 'duration': 20.666,
+ 'live_status': 'not_live',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
}, {
- 'url': 'https://www.facebook.com/watchparty/211641140192478',
+ # data.node.comet_sections.content.story.attachments[].styles.attachment.all_subattachments.nodes[].media.video_grid_renderer.video
+ 'url': 'https://www.facebook.com/hanaryushi/posts/pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l',
'info_dict': {
- 'id': '211641140192478',
+ 'id': 'pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l',
+ 'title': 'Hana Ryuushi - “sharing a relationship of having our...',
+ 'description': 'md5:75d2f9d921f40e90ba3b176f0d827cf7',
+ 'timestamp': 1706949770,
+ 'upload_date': '20240203',
+ 'uploader': 'Hana Ryuushi',
+ 'uploader_id': '100005357179289',
+ 'uploader_url': 'https://www.facebook.com/hanaryushi',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
- 'playlist_count': 1,
- 'skip': 'Requires logging in',
+ 'playlist_count': 2,
}, {
- # FIXME: Cannot parse data error
# data.event.cover_media_renderer.cover_video
- 'url': 'https://m.facebook.com/events/1509582499515440',
+ 'url': 'https://www.facebook.com/events/464667289575302/',
'info_dict': {
- 'id': '637246984455045',
+ 'id': '859946639295361',
'ext': 'mp4',
- 'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"',
- 'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022',
+ 'title': 'June Salsa & Bachata Classes On Sundays for Absolute Beginners, Improvers & Advance level.',
+ 'description': 'Dance event in Hong Kong by Dance With Style on Sunday, June 16 2024',
'thumbnail': r're:^https?://.*',
- 'uploader': 'Comitato Liberi Pensatori',
- 'uploader_id': '100065709540881',
+ 'uploader': 'Dance With Style',
+ 'uploader_id': '100064171651675',
+ 'uploader_url': r're:^https?://.*',
+ 'live_status': 'not_live',
},
+ }, {
+ 'url': 'https://m.facebook.com/stories/121668313179875/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/stories/100944752039935/UzpfSVNDOjY2ODMzMzk5NTUwMzc2MA==/',
+ 'only_matching': True,
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
_api_config = {
@@ -417,6 +447,9 @@ class FacebookIE(InfoExtractor):
}
def _perform_login(self, username, password):
+ # raise error because login with username/password is not working
+ self.raise_login_required('Login with username/password is currently not working', method='cookies')
+
login_page_req = Request(self._LOGIN_URL)
self._set_cookie('facebook.com', 'locale', 'en_US')
login_page = self._download_webpage(login_page_req, None,
@@ -429,7 +462,7 @@ def _perform_login(self, username, password):
login_form = {
'email': username,
- 'pass': password,
+ 'pass': password, # "encpass" is needed instead of plain password
'lsd': lsd,
'lgnrnd': lgnrnd,
'next': 'http://facebook.com/home.php',
@@ -475,105 +508,276 @@ def _perform_login(self, username, password):
self.report_warning(f'unable to log in: {err}')
return
- def _extract_from_url(self, url, video_id):
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
webpage = self._download_webpage(
- url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
-
- def extract_metadata(webpage):
- post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
- r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)]
- post = traverse_obj(post_data, (
- ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
- media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
- k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
- title = get_first(media, ('title', 'text'))
- description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
- page_title = title or self._html_search_regex((
- r'
]*class="uiHeaderTitle"[^>]*>(?P[^<]*)
',
- r'(?s)(?P.*?)',
- self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'(?P.+?)',
- ), webpage, 'title', default=None, group='content')
- description = description or self._html_search_meta(
- ['description', 'og:description', 'twitter:description'],
- webpage, 'description', default=None)
- uploader_data = (
- get_first(media, ('owner', {dict}))
- or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
- or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
- or get_first(post, ('node', 'actors', ..., {dict}))
- or get_first(post, ('event', 'event_creator', {dict}))
- or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
- uploader = uploader_data.get('name') or (
- clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
- or self._search_regex(
- (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
- timestamp = int_or_none(self._search_regex(
- r']+data-utime=["\'](\d+)', webpage,
- 'timestamp', default=None))
- thumbnail = self._html_search_meta(
- ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
- # some webpages contain unretrievable thumbnail urls
- # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
- # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
- if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
- thumbnail = None
- info_dict = {
- 'description': description,
- 'uploader': uploader,
- 'uploader_id': uploader_data.get('id'),
- 'timestamp': timestamp,
- 'thumbnail': thumbnail,
- 'view_count': parse_count(self._search_regex(
- (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
- webpage, 'view count', default=None)),
- 'concurrent_view_count': get_first(post, (
- ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
- **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
- 'like_count': ('likers', 'count', {int}),
- 'comment_count': ('total_comment_count', {int}),
- 'repost_count': ('share_count_reduced', {parse_count}),
- }), get_all=False),
- }
-
- info_json_ld = self._search_json_ld(webpage, video_id, default={})
- info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
- or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
- return merge_dicts(info_json_ld, info_dict)
-
- video_data = None
-
- def extract_video_data(instances):
- video_data = []
- for item in instances:
- if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
- video_item = item[2][0]
- if video_item.get('video_id'):
- video_data.append(video_item['videoData'])
- return video_data
-
- server_js_data = self._parse_json(self._search_regex(
- [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
- webpage, 'server js data', default='{}'), video_id, fatal=False)
-
- if server_js_data:
- video_data = extract_video_data(server_js_data.get('instances', []))
-
- def extract_from_jsmods_instances(js_data):
- if js_data:
- return extract_video_data(try_get(
- js_data, lambda x: x['jsmods']['instances'], list) or [])
+ re.sub(r'://(?:[\w-]+\.)?facebook\.com/', '://www.facebook.com/', url), video_id)
+
+ post_data = re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)
+ sjs_data = [self._parse_json(j, video_id, fatal=False) for j in post_data]
+ cookies = self._get_cookies(url)
+ # user passed logged-in cookies or attempted to login
+ login_data = cookies.get('c_user') and cookies.get('xs')
+ logged_in = False
+ if login_data:
+ logged_in = get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'define',
+ lambda _, v: 'CurrentUserInitialData' in v, ..., 'ACCOUNT_ID'), default='0') != '0'
+ if logged_in and (info := get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data',
+ (('ufac_client', 'state', (('set_contact_point_state_renderer', 'title'),
+ ('intro_state_renderer', 'header_title'))),
+ ('epsilon_checkpoint', 'screen', 'title')),
+ ))):
+ if any(content in info for content in ['days left to appeal', 'suspended your account']):
+ raise ExtractorError('Your account is suspended', expected=True)
+ if 'Enter mobile number' == info:
+ raise ExtractorError('Facebook is requiring mobile number confirmation', expected=True)
+ if 'your account has been locked' in info:
+ raise ExtractorError('Your account has been locked', expected=True)
+
+ if props := get_first(sjs_data, (
+ 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., (None, (..., ...)), 'rootView',
+ lambda _, v: v.get('title') is not None)):
+ if not self._cookies_passed:
+ self.raise_login_required(method='cookies')
+ else:
+ msg = re.sub(r'\s{2,}', ' ', join_nonempty('title', 'body', delim='. ', from_dict=props))
+ raise ExtractorError(f'This video is not available. Facebook said: {msg}', expected=True)
+
+ if post_data and not re.search(r'"[^"]+[^(feed)]_story[^"]*":', ','.join(post_data)):
+ raise ExtractorError('An unknown error occurred. Please try again.', expected=True)
+
+ def find_json_obj(json_strings, *patterns, obj_in_value=False, get_all=False):
+ """
+ Find JSON object, in the form of a string, by regular expression
+ >>> obj = find_json_obj([json_a, _and_b], regex_a, (regex_b, _or_c), obj_in_value=False, get_all=True)
+ @param json_strings string/list JSON string/a list of JSON strings (match all)
+ *patterns string/tuple regex patterns (if tuple, return only the 1st matched pattern)
+ obj_in_value boolean False: find the object(s) containing (one of) the pattern(s)
+ True : given pattern(s) of the key(s) to find the
+ object(s) in the value of that key(s)
+ get_all boolean return the 1st or all of the results of each regex pattern
+ @return list of tuple a list of (matching pattern, matched JSON object)
+ """
+ def find_offset(string, bracket, quotation):
+ _BRACKET_MAP = {
+ '{': ([f'{{{quotation}'], ['},', '}]', '}}', f'}}{quotation}'], (1 if obj_in_value else -1)),
+ '}': (['},', '}]', '}}', f'}}{quotation}'], [f'{{{quotation}'], 1),
+ } # ([search pattern], [opposite sign], search direction); search direction: 1 - forward, -1 - backward
+ string = re.sub(rf'{{\\{quotation}([^{quotation}]+\\{quotation}:)', rf'{{{quotation}\1 ', string.replace('{}', '[]'))
+ count, b_sum, offset = 0, 0, 0
+ for y, x in zip(((string[1:] + ' ') if _BRACKET_MAP[bracket][2] > 0 else (' ' + string[:-1]))[::_BRACKET_MAP[bracket][2]],
+ string[::_BRACKET_MAP[bracket][2]]):
+ s = (x + y) if _BRACKET_MAP[bracket][2] > 0 else (y + x)
+ count += (1 if s in _BRACKET_MAP[bracket][0] or s in _BRACKET_MAP[bracket][1] else 0)
+ b_sum += (1 if s in _BRACKET_MAP[bracket][0] else (-1 if s in _BRACKET_MAP[bracket][1] else 0))
+ offset += 1
+ if count > 0 and b_sum >= (0 if obj_in_value else 1):
+ break
+ return offset * _BRACKET_MAP[bracket][2]
+
+ for json_str in variadic(json_strings): # loop all
+ if isinstance(json_str, str):
+ # check if json_str is a JSON string and get the quotation mark (either " or ')
+ if quotation := self._search_regex(r'(["\']):\s*[\[{]*\1', json_str, 'quotation', default=None):
+ for patterns_item in patterns:
+ for pattern in variadic(patterns_item):
+ # 'patterns_item' loop - loop each item in *patterns (item can be a str or tuple)
+ found = False
+ if isinstance(pattern, str):
+ for m in re.finditer(pattern, json_str): # break according to get_all
+ if obj_in_value:
+ i = (lambda x, y: (m.start(m.lastindex or 0) + x - 1) if x > 0
+ else ((m.end(m.lastindex or 0) + len(y.group(0)) - 1) if y else None)
+ )(m.group(m.lastindex or 0).rfind('{'),
+ re.match(r'^\w*(?:":)?:?\s*{', json_str[m.end(m.lastindex or 0):]))
+ else:
+ i = m.start(m.lastindex or 0)
+ if i:
+ opening = (i + find_offset(
+ json_str[(i * obj_in_value):(i * (not obj_in_value) - obj_in_value * 2 + 1)],
+ '{', quotation,
+ ) - obj_in_value)
+ closing = i + find_offset(json_str[i:], '}', quotation)
+ if isinstance(opening, int) and isinstance(closing, int):
+ found = True
+ yield (m.group(0), json_str[opening:closing])
+ if not get_all:
+ break
+ else: # if this for loop ends with break (i.e. not get_all), else clause is not executed
+ if found:
+ break # break 'patterns_item' loop if found and get_all
+ continue # move on to the next 'pattern' (if exists) in 'patterns_item' if not found
+ break # break 'patterns_item' loop if found and not get_all
+ if found and isinstance(patterns_item, tuple):
+ break # break 'patterns_item' loop if found and patterns_item is a tuple
+
+ def extract_metadata(field=None):
+ if webpage_info.get(field) is not None:
+ return webpage_info[field]
+ elif field is None and webpage_info.get('timestamp') is not None:
+ return webpage_info
+ # extract data
+ description = title = uploader_info = None
+ # uploader
+ if field == 'uploader':
+ for x in find_json_obj(post_data, (rf'actors{Q}:[^}}]+{Q}__isActor{Q}:',
+ rf'owner{Q}:[^}}]+{Q}name{Q}:\s*{Q}[^{Q}]'),
+ get_all=True):
+ if re.search(rf'id{Q}:\s*{Q}(?:{s_id}|{video_id}){Q}', x[1]):
+ uploader_info = traverse_obj(json.loads(x[1]), {
+ 'uploader': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'),
+ 'name', {str}),
+ 'uploader_id': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'),
+ 'id', {str}),
+ 'uploader_url': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'),
+ ('url', 'profile_url'), {url_or_none}),
+ }, get_all=False)
+ break
+ if uploader_info:
+ webpage_info.update(uploader_info)
+ # title / description
+ if field in ('title', 'description', None):
+ for x in find_json_obj(post_data,
+ rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:',
+ get_all=True):
+ x_dict = json.loads(x[1])
+ for i in [i for i in [s_id, p_id] if i is not None]:
+ if x_dict.get('id') == i:
+ if (description := x_dict['message'] if isinstance(x_dict['message'], str)
+ else traverse_obj(x_dict, ('message', 'text', {str_or_none}))):
+ if track_title := self._search_regex(rf'({Q}track_title{Q}:\s*{Q}(?:(?:[^{Q}\\]|\\.)*){Q})',
+ x[1], 'track title', default=None):
+ description += '. ' + json.loads('{' + track_title + '}')['track_title']
+ break
+ if description:
+ break
+ description = description or self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'],
+ webpage, 'description', default='')
+ for x in find_json_obj(post_data, rf'title{Q}:\s*[^}}]+{Q}text{Q}:\s*{Q}[^{Q}]', get_all=True):
+ x_dict = json.loads(x[1])
+ if p_id:
+ if (text := traverse_obj(x_dict, ('title', 'text', {str_or_none}))):
+ title = title or (text if x_dict.get('id') == p_id else None)
+ description = description or (text if x_dict.get('id') == s_id else description)
+ if title and description:
+ break
+ title = (lambda x: x if x != extract_metadata('uploader') else None
+ )(title
+ or (self._html_search_regex(
+ (r'\s(?P[\s\S]+?)\s',
+ r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)
',
+ r'(?s)(?P.*?)'),
+ re.sub(r'(Facebook(\sLive)?)|(Video)', '', webpage),
+ 'title', default='', group='content')
+ or (lambda x: '' if not x or x.group(1) in ('Video', 'Facebook', 'Facebook Live')
+ else x.group(1).encode().decode('unicode_escape')
+ )(re.search(rf'{Q}meta{Q}:\s*{{{Q}title{Q}:\s*{Q}((?:[^{Q}\\]|\\.)*){Q}', webpage))
+ or og_title
+ ).split(' | ')[0]
+ or re.sub(r'(\s*\n\s*)', ' ', description))
+ webpage_info['title'] = title if len(title or '') <= 100 else title[:(47 + title[47:67].rfind(' '))] + '...'
+ webpage_info['description'] = description
+ # timestamp
+ if field in ('timestamp', None):
+ for x in find_json_obj(post_data, rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,',
+ rf'publish_time{Q}:\s*\d+,', get_all=True):
+ if re.search(rf'id{Q}:\s*{Q}(?:(?:{s_id})|(?:{p_id})){Q}', x[1]):
+ webpage_info['timestamp'] = json.loads(x[1]).get(re.split(f'{Q}', x[0])[0])
+ break
+ # like count
+ if field in ('like_count', None):
+ like_count = (re.search(rf'localized_name{Q}:\s*{Q}Like{Q}.[^}}]+{Q}reaction_count{Q}:\s*(\d+)}}', feedback_data)
+ or re.search(rf'likers{Q}:\s*{{{Q}count{Q}:\s*(\d+)}}', feedback_data))
+ webpage_info['like_count'] = int(like_count.group(1)) if like_count else None
+ # comment count
+ if field in ('comment_count', None):
+ for x in find_json_obj(feedback_data, (rf'comments{Q}:\s*[^}}]+(total_count{Q}:\s*\d+)',
+ rf'total_comment_count{Q}:\s*\d+')):
+ webpage_info['comment_count'] = parse_count(traverse_obj(json.loads(x[1]),
+ (('total_count', (..., 'ig_comment_count'), 'total_comment_count'), {str_or_none}), get_all=False))
+ # share count
+ if field in ('repost_count', None):
+ for x in find_json_obj(feedback_data, (rf'share_count{Q}:\s*[^}}]+(count{Q}:\s*"?\d+)',
+ rf'share_count(?:_reduced){Q}:\s*"?\d+')):
+ webpage_info['repost_count'] = parse_count(traverse_obj(json.loads(x[1]),
+ (('count', 'share_count_reduced', 'total_count'), {str_or_none}), get_all=False))
+ # return data
+ return webpage_info.get(field) if field else webpage_info
+
+ og_title = self._og_search_title(webpage, default='').split(' | ')
+ if len(og_title) > 1 and re.search(r'\d+\w?\s(?:reactions|shares|views)', og_title[0]):
+ og_title.pop(0)
+ og_title = re.sub(r'(\s*\n\s*)', ' ', ' | '.join(og_title))
+ thumbnail = self._html_search_meta(
+ ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+ if thumbnail and not re.search(r'\.(?:gif|jpg|png|webp)', thumbnail):
+ thumbnail = None
+
+ webpage_info = {
+ 'thumbnails': [{k: v for k, v in {
+ 'url': thumbnail,
+ 'height': int_or_none(self._search_regex(
+ r'stp=.+_[a-z]\d+x(\d+)&', thumbnail, 'thumbnail height', default=None)),
+ 'preference': None if 'stp=' in thumbnail else 1,
+ }.items() if v is not None}] if url_or_none(thumbnail) else [],
+ 'view_count': parse_count(self._search_regex(
+ (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
+ webpage, 'view count', default=None)),
+ }
- def extract_dash_manifest(vid_data, formats, mpd_url=None):
- dash_manifest = traverse_obj(
- vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
- if dash_manifest:
- formats.extend(self._parse_mpd_formats(
+ p_id, s_id, linked_url, data, feedback_data = None, None, None, [], ''
+ Q = self._search_regex(r'(["\']):\s*[\[{]*\1', (post_data[0] if post_data else ''), 'quotation', default='"')
+ for p_data in post_data[:]:
+ if rf'{Q}feed_unit{Q}:' in p_data or not re.search(
+ rf'{Q}(?:dash_manifest_urls?|message|event_description){Q}:', p_data):
+ # discard useless feed data
+ post_data.remove(p_data)
+ else:
+ if (not s_id or not p_id) and (f'{Q}story{Q}:' in p_data or f'{Q}creation_story{Q}:' in p_data):
+ p_id = p_id or self._search_regex(rf'{Q}(?:post_id|videoId|video_id){Q}:\s*{Q}(\d+){Q}', p_data,
+ 'post id', default=(video_id if video_id.isnumeric() else None))
+ s_id = s_id or self._search_regex(rf'id{Q}:\s*{Q}(Uzpf[^{Q}]+){Q}', p_data, 'story id', default=None)
+ if not data:
+ if re.search(rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:', p_data):
+ # linked video
+ for x in find_json_obj(p_data, rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:',
+ obj_in_value=True):
+ if linked_url := traverse_obj(
+ json.loads(x[1]), (('web_link', None), 'url', {url_or_none}), get_all=False):
+ url_transparent = '.facebook.com' not in urllib.parse.urlparse(linked_url).netloc
+ data = x[1]
+ break
+ elif f'{Q}dash_manifest_url' in p_data[:p_data.find(f'{Q}comment_list_renderer{Q}:')]:
+ for x in find_json_obj(p_data, rf'{Q}data{Q}:\s*{{', rf'{Q}data{Q}:', obj_in_value=True):
+ if f'{Q}dash_manifest_url' in x[1]:
+ data = x[1]
+ break
+ if (not feedback_data
+ and (f'{Q}likers{Q}:' in p_data or f'{Q}Like{Q}}},' in p_data or f'comment_count{Q}:' in p_data)):
+ for x in find_json_obj(p_data, rf'reaction_count{Q}:\s*{{', rf'feedback{Q}:\s*{{'):
+ if (((s_id or p_id or video_id) in x[1] or (p_id or video_id) in x[1])
+ and webpage.count(json.loads(x[1]).get('id', 'null')) > 1):
+ feedback_data = x[1]
+ break
+
+ if linked_url:
+ return self.url_result(linked_url, video_id=video_id, url_transparent=url_transparent,
+ **{k: v for k, v in (extract_metadata() if url_transparent else {}).items() if v})
+
+ def extract_dash_manifest(vid_data, formats, subtitle, mpd_url=None):
+ if dash_manifest := traverse_obj(vid_data, 'dash_manifest_xml_string', 'manifest_xml',
+ 'playlist', 'dash_manifest', expected_type=str):
+ fmts, subs = self._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
- mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
+ mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitle[0])
def process_formats(info):
- # Downloads with browser's User-Agent are rate limited. Working around
- # with non-browser User-Agent.
for f in info['formats']:
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
@@ -581,77 +785,45 @@ def process_formats(info):
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
- def yield_all_relay_data(_filter):
- for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage):
- yield self._parse_json(relay_data, video_id, fatal=False) or {}
-
- def extract_relay_data(_filter):
- return next(filter(None, yield_all_relay_data(_filter)), {})
-
- def extract_relay_prefetched_data(_filter, target_keys=None):
- path = 'data'
- if target_keys is not None:
- path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
- return traverse_obj(yield_all_relay_data(_filter), (
- ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
- lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
- ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
-
- if not video_data:
- server_js_data = self._parse_json(self._search_regex([
- r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
- rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);',
- ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
- video_data = extract_from_jsmods_instances(server_js_data)
-
- if not video_data:
- data = extract_relay_prefetched_data(
- r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
- target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
- if data:
- entries = []
-
- def parse_graphql_video(video):
- v_id = video.get('videoId') or video.get('id') or video_id
- reel_info = traverse_obj(
- video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
- if reel_info:
- video = video['creation_story']
- video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
- video.update(reel_info)
-
- formats = []
- q = qualities(['sd', 'hd'])
-
- # Legacy formats extraction
- fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
- for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
- ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
- ('browser_native_sd_url', 'sd')):
- playable_url = fmt_data.get(key)
- if not playable_url:
- continue
+ if data:
+ def parse_graphql_video(video):
+ v_id = video.get('videoId') or video.get('id') or video_id
+ formats, captions, subtitles = [], {}, {}
+ q = qualities(['sd', 'hd'])
+ is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
+
+ # videoDeliveryLegacy formats extraction
+ fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+ for key, format_id in (('browser_native_hd_url', 'hd'), ('browser_native_sd_url', 'sd')):
+ # obsoleted: ('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), ('playable_url_dash', '')
+ if playable_url := fmt_data.get(key):
if determine_ext(playable_url) == 'mpd':
- formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(playable_url, video_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles))
else:
+ q = qualities(['sd', 'hd'])
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': playable_url,
})
- extract_dash_manifest(fmt_data, formats)
+ extract_dash_manifest(fmt_data, formats, [captions if is_broadcast else subtitles])
- # New videoDeliveryResponse formats extraction
- fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+ # videoDeliveryResponse formats extraction
+ if fmt_data := traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult')):
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
- extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+ extract_dash_manifest(dash_manifest, formats, [captions if is_broadcast else subtitles],
+ mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
- formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
@@ -661,157 +833,174 @@ def parse_graphql_video(video):
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
- formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
-
- if not formats:
- # Do not append false positive entry w/o any formats
- return
-
- automatic_captions, subtitles = {}, {}
- is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
- for caption in traverse_obj(video, (
- 'video_available_captions_locales',
- {lambda x: sorted(x, key=lambda c: c['locale'])},
- lambda _, v: url_or_none(v['captions_url']),
- )):
- lang = caption.get('localized_language') or 'und'
- subs = {
- 'url': caption['captions_url'],
- 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
- }
- if caption.get('localized_creation_method') or is_broadcast:
- automatic_captions.setdefault(caption['locale'], []).append(subs)
- else:
- subtitles.setdefault(caption['locale'], []).append(subs)
- captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
- if captions_url and not automatic_captions and not subtitles:
- locale = self._html_search_meta(
- ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
- (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
-
- info = {
- 'id': v_id,
- 'formats': formats,
- 'thumbnail': traverse_obj(
- video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
- 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})),
- 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
- 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
- or float_or_none(video.get('length_in_second'))),
- 'automatic_captions': automatic_captions,
- 'subtitles': subtitles,
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4',
+ fatal=False, m3u8_id='hls')
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles))
+
+ # captions/subtitles
+ for caption in traverse_obj(video, (
+ 'video_available_captions_locales',
+ {lambda x: sorted(x, key=lambda c: c['locale'])},
+ lambda _, v: url_or_none(v['captions_url']),
+ )):
+ lang = caption.get('localized_language') or 'und'
+ subs = {
+ 'url': caption['captions_url'],
+ 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
}
- process_formats(info)
- description = try_get(video, lambda x: x['savable_description']['text'])
- title = video.get('name')
- if title:
- info.update({
- 'title': title,
- 'description': description,
- })
+ if caption.get('localized_creation_method') or is_broadcast:
+ captions.setdefault(caption['locale'], []).append(subs)
else:
- info['title'] = description or f'Facebook video #{v_id}'
- entries.append(info)
-
- def parse_attachment(attachment, key='media'):
- media = attachment.get(key) or {}
- if media.get('__typename') == 'Video':
- return parse_graphql_video(media)
-
- nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
- attachments = traverse_obj(nodes, (
- ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
- ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
- 'attachment', {dict}))
- for attachment in attachments:
- ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
- ('target', 'attachments', ..., 'styles', 'attachment', {dict}))
- for n in ns:
- parse_attachment(n)
- parse_attachment(attachment)
-
- edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
- for edge in edges:
- parse_attachment(edge, key='node')
-
- video = traverse_obj(data, (
- 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {}
- if video:
- attachments = try_get(video, [
- lambda x: x['story']['attachments'],
- lambda x: x['creation_story']['attachments'],
- ], list) or []
- for attachment in attachments:
- parse_attachment(attachment)
- if not entries:
- parse_graphql_video(video)
-
- if len(entries) > 1:
- return self.playlist_result(entries, video_id)
-
- video_info = entries[0] if entries else {'id': video_id}
- webpage_info = extract_metadata(webpage)
- # honor precise duration in video info
- if video_info.get('duration'):
- webpage_info['duration'] = video_info['duration']
- # preserve preferred_thumbnail in video info
- if video_info.get('thumbnail'):
- webpage_info['thumbnail'] = video_info['thumbnail']
- return merge_dicts(webpage_info, video_info)
-
- if not video_data:
- m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">(.*?)
', webpage)
- if m_msg is not None:
- raise ExtractorError(
- f'The video is not available, Facebook said: "{m_msg.group(1)}"',
- expected=True)
- elif any(p in webpage for p in (
- '>You must log in to continue',
- 'id="login_form"',
- 'id="loginbutton"')):
- self.raise_login_required()
-
- if not video_data and '/watchparty/' in url:
- post_data = {
- 'doc_id': 3731964053542869,
- 'variables': json.dumps({
- 'livingRoomID': video_id,
- }),
- }
-
- prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
- if prefetched_data:
- lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
- if lsd:
- post_data[lsd['name']] = lsd['value']
-
- relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
- for define in (relay_data.get('define') or []):
- if define[0] == 'RelayAPIConfigDefaults':
- self._api_config = define[2]
+ subtitles.setdefault(caption['locale'], []).append(subs)
+ captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
+ if captions_url and not captions and not subtitles:
+ locale = self._html_search_meta(
+ ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
+ (captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
+ # thumbnails
+ thumbnails = []
+ for url in [uri for uri in [traverse_obj(video, path) for path in [
+ ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri'),
+ ('image', 'uri'), ('previewImage', 'uri'),
+ ]] if url_or_none(uri) is not None]:
+ if (re.search(r'\.(?:jpg|png)', url)
+ and not any(url.split('_cat=')[0] in t['url'] for t in thumbnails)):
+ thumbnails.append({k: v for k, v in {
+ 'url': url,
+ 'height': int_or_none(self._search_regex(
+ r'stp=.+_[a-z]\d+x(\d+)&', url, 'thumbnail height', default=None)),
+ 'preference': None if 'stp=' in url else 1,
+ }.items() if v is not None})
+ # timestamp
+ v_timestamp = traverse_obj(video, 'publish_time', 'creation_time', 'created_time', {int_or_none})
+ if not v_timestamp and v_id != video_id:
+ for x in find_json_obj(post_data, rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,',
+ rf'publish_time{Q}:\s*\d+,', get_all=True):
+ if re.search(rf'id{Q}:\s*{Q}{v_id}{Q}', x[1]):
+ if v_timestamp := json.loads(x[1]).get(x[0].split(f'{Q}')[0]):
+ break
+ # uploader
+ if uploader_id := traverse_obj(video, ('owner', 'id', {str_or_none})):
+ if x := list(find_json_obj(data, (rf'id{Q}:\s*{Q}{uploader_id}{Q}[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]',
+ rf'{Q}name{Q}:\s*{Q}[^{Q}][^}}]*{Q}id{Q}:\s*{Q}{uploader_id}{Q}'))):
+ if x[0][1]:
+ video['owner'] = merge_dicts(video['owner'], json.loads(x[0][1]))
+ elif x := list(find_json_obj(post_data, (rf'(owner{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]'),
+ rf'((?!share)\w{{5}}_creator{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]',
+ obj_in_value=True)):
+ if x[0][1]:
+ video['owner'] = json.loads(x[0][1])
+ uploader_id = traverse_obj(video, ('owner', 'id', {str_or_none}))
+ uploader = traverse_obj(video, ('owner', 'name', {str_or_none})) or extract_metadata('uploader')
+ # description
+ v_desc = traverse_obj(video, ('savable_description', 'text', {str_or_none}))
+ if not v_desc and v_id != video_id:
+ if vs_id := traverse_obj(video, (
+ (None, (..., 'video')), 'creation_story', 'id', {str_or_none}), get_all=False):
+ if x := list(find_json_obj(
+ data, rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:\s*{Q}{vs_id}{Q}')):
+ v_desc = (lambda x: x if x != uploader else None)(json.loads(x[0][1])['message']['text'])
+ # else:
+ # for x in find_json_obj(data, rf'video{Q}:\s*{{{Q}id{Q}:\s*{Q}{v_id}{Q}', get_all=True):
+ # if v_desc := traverse_obj(json.loads(x[1]), ('message', 'text', {str_or_none})):
+ # break
+ # title
+ if v_name := video.get('name'):
+ v_title = v_name if len(v_name) <= 100 else v_name[:(47 + v_name[47:67].rfind(' '))] + '...'
+
+ info = {
+ 'id': v_id,
+ 'title': ((v_title if video.get('name') else None)
+ or (f"{extract_metadata('title')} Facebook Video #{v_id}" if extract_metadata('title')
+ else (f'{uploader} Facebook Video #{v_id}' if uploader else f'Facebook Video #{v_id}'))),
+ 'description': v_desc or extract_metadata('description'),
+ 'thumbnails': thumbnails,
+ 'timestamp': v_timestamp or extract_metadata('timestamp'),
+ 'uploader': uploader,
+ 'uploader_id': uploader_id or webpage_info.get('uploader_id'),
+ 'uploader_url': (traverse_obj(video, ('owner', 'url', {url_or_none}))
+ or (webpage_info.get('uploader_url') if webpage_info.get('uploader') == uploader else None)
+ or (lambda x: f'https://www.facebook.com/profile.php?id={x}' if x else None
+ )(uploader_id or webpage_info.get('uploader_id'))),
+ 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
+ or float_or_none(video.get('length_in_second'))),
+ 'formats': formats,
+ 'automatic_captions': captions,
+ 'subtitles': subtitles,
+ 'is_live': video.get('is_live_streaming'),
+ 'was_live': (video.get('broadcast_status') == 'VOD_READY'),
+ 'concurrent_view_count': video.get('liveViewerCount'),
+ 'like_count': extract_metadata('like_count'),
+ 'comment_count': extract_metadata('comment_count'),
+ 'repost_count': extract_metadata('repost_count'),
+ 'age_limit': 18 if (re.search(rf'{Q}validator{Q}:\s*{Q}GRAPHIC{Q}', data)
+ and f'{Q}OverlayWarningScreenViewModel{Q}' in data) else None,
+ }
+ process_formats(info)
+ entries.append(info)
+
+ entries, video_ids = [], []
+ # for idx, x in enumerate(find_json_obj(data, (rf'dash_manifest_url{Q}:\s*{Q}',
+ # rf'_hd_url{Q}:\s*{Q}', rf'_sd_url{Q}:\s*{Q}'), get_all=True)):
+ for idx, x in enumerate(find_json_obj(data, (rf'videoDeliveryLegacyFields{Q}:'), get_all=True)):
+ media = json.loads(x[1])
+ if (media.get('__typename', 'Video') == 'Video'
+ and not media.get('sticker_image')
+ and media.get('id', f'{video_id}_{idx}') not in video_ids):
+ video_ids.append(media.get('id', f'{video_id}_{idx}'))
+ parse_graphql_video(media)
+ if media.get('id') == video_id:
+ break
+
+ if len(entries) > 1:
+ return self.playlist_result(entries, video_id, **{
+ k: v for k, v in extract_metadata().items() if v})
+
+ video_info = entries[0] if entries else {'id': video_id}
+ video_info['title'] = re.sub(r' Facebook Video #\d{15,}$', '', video_info.get('title'))
+ if webpage_info['thumbnails']:
+ if not (any(webpage_info['thumbnails'][0]['url'].split('_cat=')[0] in thumbnail['url']
+ for thumbnail in video_info['thumbnails'])):
+ video_info['thumbnails'].extend(webpage_info['thumbnails'])
+ return merge_dicts(video_info, webpage_info)
+
+ # if 'data' not found
+ video_data = None
- living_room = self._download_json(
- urljoin(url, self._api_config['graphURI']), video_id,
- data=urlencode_postdata(post_data))['data']['living_room']
+ def extract_video_data(instances):
+ video_data = []
+ for item in instances:
+ if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
+ video_item = item[2][0]
+ if video_item.get('video_id'):
+ video_data.append(video_item['videoData'])
+ return video_data
- entries = []
- for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
- video = try_get(edge, lambda x: x['node']['video']) or {}
- v_id = video.get('id')
- if not v_id:
- continue
- v_id = str(v_id)
- entries.append(self.url_result(
- self._VIDEO_PAGE_TEMPLATE % v_id,
- self.ie_key(), v_id, video.get('name')))
+ def extract_from_jsmods_instances(js_data):
+ if js_data:
+ return extract_video_data(try_get(
+ js_data, lambda x: x['jsmods']['instances'], list) or [])
- return self.playlist_result(entries, video_id)
+ if server_js_data := self._parse_json(self._search_regex(
+ [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
+ webpage, 'server js data', default='{}'), video_id, fatal=False):
+ video_data = extract_video_data(server_js_data.get('instances', []))
if not video_data:
+ if server_js_data := self._parse_json(self._search_regex([
+ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
+ rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);',
+ ], webpage, 'js data', default='{}'), video_id, js_to_json, False):
+ video_data = extract_from_jsmods_instances(server_js_data)
+
+ if not video_data and False: # skipped because not working
# Video info not in first request, do a secondary request using
# tahoe player specific URL
tahoe_data = self._download_webpage(
- self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
+ self._VIDEO_PAGE_TAHOE_TEMPLATE % p_id, video_id,
+ fatal=False,
+ expected_status=404,
data=urlencode_postdata({
'__a': 1,
'__pc': self._search_regex(
@@ -827,15 +1016,30 @@ def parse_attachment(attachment, key='media'):
headers={
'Content-Type': 'application/x-www-form-urlencoded',
})
- tahoe_js_data = self._parse_json(
- self._search_regex(
- r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
- 'tahoe js data', default='{}'),
- video_id, fatal=False)
- video_data = extract_from_jsmods_instances(tahoe_js_data)
+ if tahoe_data:
+ tahoe_js_data = self._parse_json(
+ self._search_regex(
+ r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
+ 'tahoe js data', default='{}'),
+ video_id, fatal=False)
+ video_data = extract_from_jsmods_instances(tahoe_js_data)
if not video_data:
- raise ExtractorError('Cannot parse data')
+ m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">(.*?)
', webpage)
+ if m_msg is not None:
+ raise ExtractorError(
+ f'The video is not available, Facebook said: "{m_msg.group(1)}"',
+ expected=True)
+ elif any(p in webpage for p in (
+ '>You must log in to continue',
+ 'id="login_form"',
+ 'id="loginbutton"')):
+ self.raise_login_required(method='cookies')
+ elif not login_data:
+ self.raise_login_required('No video formats found', method='cookies')
+ elif not logged_in:
+ self.raise_login_required('Failed to login with provided data', method='cookies')
+ self.raise_no_formats('No video formats found!')
if len(video_data) > 1:
entries = []
@@ -846,6 +1050,7 @@ def parse_attachment(attachment, key='media'):
entries.append(self.url_result(urljoin(
url, video_url), self.ie_key(), v[0].get('video_id')))
return self.playlist_result(entries, video_id)
+
video_data = video_data[0]
formats = []
@@ -871,7 +1076,7 @@ def parse_attachment(attachment, key='media'):
'quality': preference,
'height': 720 if quality == 'hd' else None,
})
- extract_dash_manifest(f[0], formats)
+ extract_dash_manifest(f[0], formats, [subtitles])
subtitles_src = f[0].get('subtitles_src')
if subtitles_src:
subtitles.setdefault('en', []).append({'url': subtitles_src})
@@ -882,31 +1087,33 @@ def parse_attachment(attachment, key='media'):
'subtitles': subtitles,
}
process_formats(info_dict)
- info_dict.update(extract_metadata(webpage))
+ info_dict.update(webpage_info)
return info_dict
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
- return self._extract_from_url(real_url, video_id)
-
class FacebookPluginsVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)'
_TESTS = [{
'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560',
- 'md5': '5954e92cdfe51fe5782ae9bda7058a07',
'info_dict': {
'id': '10154383743583686',
'ext': 'mp4',
- # TODO: Fix title, uploader
'title': 'What to do during the haze?',
- 'uploader': 'Gov.sg',
+ 'description': 'md5:81839c0979803a014b20798df255ed0b',
+ 'thumbnail': r're:^https?://.*',
+ 'uploader': 'gov.sg',
+ 'uploader_id': '100064718678925',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20160826',
'timestamp': 1472184808,
+ 'duration': 65.087,
+ 'view_count': int,
+ 'concurrent_view_count': int,
+ 'live_status': 'not_live',
+ 'like_count': int,
+ 'comment_count': int,
},
'add_ie': [FacebookIE.ie_key()],
}, {
@@ -937,7 +1144,7 @@ class FacebookRedirectURLIE(InfoExtractor):
'playable_in_embed': True,
'categories': ['Music'],
'channel': 'Boiler Room',
- 'uploader_id': 'brtvofficial',
+ 'uploader_id': '@boilerroom',
'uploader': 'Boiler Room',
'tags': 'count:11',
'duration': 3332,
@@ -945,11 +1152,16 @@ class FacebookRedirectURLIE(InfoExtractor):
'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg',
'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg',
'availability': 'public',
- 'uploader_url': 'http://www.youtube.com/user/brtvofficial',
+ 'uploader_url': r're:^https?://.*',
'upload_date': '20150917',
+ 'timestamp': 1442489450,
'age_limit': 0,
'view_count': int,
'like_count': int,
+ 'heatmap': 'count:100',
+ 'channel_is_verified': True,
+ 'channel_follower_count': int,
+ 'comment_count': int,
},
'add_ie': ['Youtube'],
'params': {'skip_download': 'Youtube'},
@@ -968,18 +1180,19 @@ class FacebookReelIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.facebook.com/reel/1195289147628387',
- 'md5': 'a53256d10fc2105441fe0c4212ed8cea',
'info_dict': {
'id': '1195289147628387',
'ext': 'mp4',
- 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$',
- 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$',
+ 'title': 'md5:32aab9976c6b8a145fc0d799631e2b74',
+ 'description': 'md5:3ea795c5ebb7ed28e3e78bb7b1191753',
'uploader': 'Beast Camp Training',
'uploader_id': '100040874179269',
+ 'uploader_url': r're:^https?://.*',
'duration': 9.579,
'timestamp': 1637502609,
'upload_date': '20211121',
'thumbnail': r're:^https?://.*',
+ 'live_status': 'not_live',
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -989,7 +1202,7 @@ class FacebookReelIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
- f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)
+ f'https://www.facebook.com/watch/?v={video_id}', FacebookIE, video_id)
class FacebookAdsIE(InfoExtractor):
@@ -1073,11 +1286,15 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- post_data = traverse_obj(
- re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads}))
- data = get_first(post_data, (
- 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ...,
- 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict}))
+ if post_data := traverse_obj(
+ re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})):
+ data = get_first(post_data, (
+ 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ...,
+ 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict}))
+ elif post_data := traverse_obj(
+ re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage), (..., {json.loads})):
+ data = get_first(post_data, (
+ 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}))
if not data:
raise ExtractorError('Unable to extract ad data')