From d144b0164735520bfc71bc058980a074bc0267c1 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Fri, 16 Aug 2024 09:12:08 +0800 Subject: [PATCH] [ie/facebook] Experimental - extract data from JSON strings by searching with regex instead of traversing dicts - extract linked video - improved thumbnail info - more lenient url pattern - live status Known issue: videos in comments might be included --- yt_dlp/extractor/facebook.py | 1257 +++++++++++++++++++--------------- 1 file changed, 711 insertions(+), 546 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a43ffe95e2f4..ad0617f3a25e 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -12,7 +12,6 @@ determine_ext, float_or_none, format_field, - get_element_by_id, get_first, int_or_none, join_nonempty, @@ -27,7 +26,6 @@ url_or_none, urlencode_postdata, urljoin, - variadic, ) @@ -39,19 +37,12 @@ class FacebookIE(InfoExtractor): (?:[^#]*?\#!/)? (?: (?: - permalink\.php| - video/video\.php| - photo\.php| - video\.php| - video/embed| - story\.php| - watch(?:/live)?/? + (?:video/)?[a-z]{5,}(?:\.php|/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)?| - [^/]+/posts/| + [^/]+/(?:videos|posts)/(?:[^/]+/)?| events/(?:[^/]+/)?| groups/[^/]+/(?:permalink|posts)/| - watchparty/ + [a-z]{5,}/| )| facebook: ) @@ -73,177 +64,196 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ - 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', - 'info_dict': { - 'id': '3676516585958356', - 'ext': 'mp4', - 'title': 'dr Adam Przygoda', - 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', - 'uploader': 'RADIO KICKS FM', - 'upload_date': '20230818', - 'timestamp': 1692346159, - 'thumbnail': r're:^https?://.*', - 'uploader_id': '100063551323670', - 'duration': 3132.184, - 'view_count': int, - 'concurrent_view_count': 0, - }, - }, { - 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', - 'md5': '6a40d33c0eccbb1af76cf0485a052659', - 'info_dict': { - 'id': '637842556329505', - 'ext': 'mp4', - 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', - 'uploader': 'Tennis on Facebook', - 'upload_date': '20140908', - 'timestamp': 1410199200, - }, - 'skip': 'Requires logging in', - }, { - # data.video + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif', + 'title': 'AsifFacebook', 'description': '', - 'uploader': 'Asif Nawab Butt', - 'upload_date': '20140506', - 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'timestamp': 1399398998, + 'upload_date': '20140506', + 'uploader': 'Asif Nawab Butt', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'duration': 131.03, + 'view_count': int, 'concurrent_view_count': int, + 'live_status': 'not_live', }, }, { - 'note': 'Video with DASH manifest', - 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': 'b2c28d528273b323abe5c6ab59f0f030', + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', 'info_dict': { - 'id': '957955867617029', + 'id': '3676516585958356', 'ext': 'mp4', - 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', - 'uploader': 'Demy de Zeeuw', - 'upload_date': '20160110', - 'timestamp': 1452431627, + 'title': 'dr Adam Przygoda', + 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1692346159, + 'upload_date': '20230818', + 'uploader': 'RADIO KICKS FM', + 'uploader_id': '100063551323670', + 'uploader_url': r're:^https?://.*', + 'duration': 3133.583, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, - 'skip': 'Requires logging in', }, { - 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', - 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'info_dict': { - 'id': '544765982287235', + 'id': '10155529876156509', 'ext': 'mp4', - 'title': '"What are you doing running in the snow?"', - 'uploader': 'FailArmy', + 'title': 'Holocaust survivor becomes US citizen', + 'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1477818095, + 'upload_date': '20161030', + 'uploader': 'CNN', + 'uploader_id': '100059479812265', + 'uploader_url': r're:^https?://.*', + 'duration': 44.181, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, - 'skip': 'Video gone', }, { - 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', - 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { - 'id': '1035862816472149', + 'id': '359649331226507', 'ext': 'mp4', - 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', - 'uploader': 'S. Saint', + 'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1', + 'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1527084179, + 'upload_date': '20180523', + 'uploader': 'ESL One Dota 2', + 'uploader_id': '100066514874195', + 'uploader_url': r're:^https?://.*', + 'duration': 4524.001, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Video gone', }, { - 'note': 'swf params escaped', - 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', - 'md5': '97ba073838964d12c70566e0085c2b91', + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { - 'id': '10153664894881749', + 'id': '106560053808006', 'ext': 'mp4', - 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', + 'title': 'JosefFacebook', + 'description': '', 'thumbnail': r're:^https?://.*', - 'timestamp': 1456259628, - 'upload_date': '20160223', - 'uploader': 'Barack Obama', + 'timestamp': 1549275572, + 'upload_date': '20190204', + 'uploader': 'Josef Novak', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', + 'duration': 3.283, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, - 'skip': 'Gif on giphy.com gone', }, { - # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media - 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': 'ca63897a90c9452efee5f8c40d080e25', + 'url': 'https://www.facebook.com/watch/?v=647537299265662', 'info_dict': { - 'id': '10155529876156509', + 'id': '647537299265662', 'ext': 'mp4', - 'title': 'Holocaust survivor becomes US citizen', - 'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f', - 'timestamp': 1477818095, - 'upload_date': '20161030', - 'uploader': 'CNN', + 'title': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su...', + 'description': 'Padre ense\u00f1a a su hijo a c\u00f3mo ba\u00f1ar un reci\u00e9n nacido junto con su gato y se hace viral, mir\u00e1 el video 😍', 'thumbnail': r're:^https?://.*', + 'timestamp': 1605534618, + 'upload_date': '20201116', + 'uploader': 'InfoPico', + 'uploader_id': '100064391811349', + 'uploader_url': r're:^https?://.*', + 'duration': 136.179, 'view_count': int, - 'uploader_id': '100059479812265', 'concurrent_view_count': int, - 'duration': 44.478, + 'live_status': 'not_live', }, }, { - # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', 'info_dict': { - 'id': '1417995061575415', + 'id': '1823658634322275', 'ext': 'mp4', - 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', - 'description': 'Довгоочікуване відео', - 'timestamp': 1486648217, - 'upload_date': '20170209', - 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', - 'concurrent_view_count': int, + 'title': 'Live Webcam from Corfu - Greece', + 'description': 'md5:84c1af6894ecffe710c79744e4873e85', 'thumbnail': r're:^https?://.*', + 'timestamp': 1521449766, + 'upload_date': '20180319', + 'uploader': 'SkylineWebcams', + 'uploader_id': '100064307154679', + 'uploader_url': r're:^https?://.*', + 'duration': 14424.199, 'view_count': int, - 'duration': 11736.446, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, 'params': { 'skip_download': True, }, }, { - # FIXME: Cannot parse data error - 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', + # data.video.story.attachments[].media + 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', 'info_dict': { - 'id': '1072691702860471', + 'id': '117576630041613', 'ext': 'mp4', - 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', - 'timestamp': 1477305000, - 'upload_date': '20161024', - 'uploader': 'La Guía Del Varón', + 'title': 'Officers Rescue Trapped Motorist from Mahoning River Crash 11-22-20', 'thumbnail': r're:^https?://.*', + 'timestamp': 1606162592, + 'upload_date': '20201123', + 'uploader': 'City of Alliance Police Department', + 'uploader_id': '100064413680392', + 'uploader_url': r're:^https?://.*', + 'duration': 101.504, + 'view_count': int, + 'concurrent_view_count': int, }, 'skip': 'Requires logging in', }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', + # web_link url: http://giphy.com/gifs/l3vR8BKU0m8uX2mAg + 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 'info_dict': { - 'id': '202882990186699', + 'id': 'l3vR8BKU0m8uX2mAg', 'ext': 'mp4', - 'title': 'birb (O v O") | Hello? Yes your uber ride is here', - 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...', - 'timestamp': 1486035513, - 'upload_date': '20170202', - 'uploader': 'Elisabeth Ahtn', - 'uploader_id': '100013949973717', + 'title': 'Nada mas satisfactorio que los otros 5... - La Guía Del Varón', + 'description': 'Nada mas satisfactorio que los otros 5 minutos', + 'tags': ['giphyupload'], + 'thumbnail': r're:^https?://.*', + 'timestamp': 1477305000, + 'upload_date': '20161022', + 'uploader': 'La Guía Del Varón', + 'uploader_id': '100050567346031', + 'uploader_url': r're:^https?://.*', }, - 'skip': 'Requires logging in', + 'skip': 'Gif on giphy.com', }, { - # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media + # web_link url: https://www.facebook.com/attn/videos/1569199726448814/ 'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/', 'info_dict': { 'id': '1569199726448814', 'ext': 'mp4', - 'title': 'Pence MUST GO!', - 'description': 'Vickie Gentry shared a memory.', + 'title': 'What if marijuana companies were allowed to have TV ads like B...', + 'description': 'What if we treated marijuana ads like big pharma ads?', + 'thumbnail': r're:^https?://.*', 'timestamp': 1511548260, 'upload_date': '20171124', - 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', - 'thumbnail': r're:^https?://.*', - 'duration': 148.435, + 'uploader': 'ATTN:', + 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', + 'duration': 148.224, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -251,15 +261,37 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '6968553779868435', 'ext': 'mp4', + 'title': 'ATTN: - Learning new problem-solving skills is hard for...', 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', - 'uploader': 'ATTN:', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1701975646, 'upload_date': '20231207', - 'title': 'ATTN:', - 'duration': 132.675, + 'uploader': 'ATTN:', 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', + 'duration': 132.675, 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', + }, + }, { + # data.node.comet_sections.content.story.attachments[].style.attachment.media + 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', + 'info_dict': { + 'id': '202882990186699', + 'ext': 'mp4', + 'title': 'birb (O v O")', + 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55', 'thumbnail': r're:^https?://.*', - 'timestamp': 1701975646, + 'timestamp': 1486035494, + 'upload_date': '20170202', + 'uploader': 'Elisabeth Ahtn', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', + 'duration': 23.714, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -267,146 +299,110 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '270103405756416', 'ext': 'mp4', - 'title': 'Lela Evans', - 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', + 'title': 'Lela Evans - Today Makkovik\'s own Pilot Mandy Smith made...', + 'description': 'md5:cc93a91feb89923303c1f78656791e4d', 'thumbnail': r're:^https?://.*', - 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', - 'upload_date': '20231228', 'timestamp': 1703804085, + 'upload_date': '20231228', + 'uploader': 'Lela Evans', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'duration': 394.347, 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, }, { - 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', - 'only_matching': True, - }, { - 'url': 'https://www.facebook.com/video.php?v=10204634152394104', - 'only_matching': True, + # data.node.comet_sections.content.story.attachments[].style.attachment.all_subattachments.nodes[].media.video_grid_renderer.video + 'url': 'https://www.facebook.com/story.php?story_fbid=5268096689957022&id=100002702286715', + 'info_dict': { + 'id': '669977824610306', + 'ext': 'mp4', + 'title': 'md5:f2666feb05057a09f8b6f542cd7a3eda', + 'description': 'md5:f5775e7245153857caade33e757ceb21', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1671780203, + 'upload_date': '20221223', + 'uploader': 'Azura Tan Siow Ling', + 'uploader_id': '100002702286715', + 'uploader_url': r're:^https?://.*', + 'duration': 20.666, + 'live_status': 'not_live', + }, }, { - 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', - 'only_matching': True, + # data.node.comet_sections.content.story.attachments[].styles.attachment.all_subattachments.nodes[].media.video_grid_renderer.video + 'url': 'https://www.facebook.com/hanaryushi/posts/pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l', + 'info_dict': { + 'id': 'pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l', + 'title': 'Hana Ryuushi - “sharing a relationship of having our...', + 'description': 'md5:75d2f9d921f40e90ba3b176f0d827cf7', + 'timestamp': 1706949770, + 'upload_date': '20240203', + 'uploader': 'Hana Ryuushi', + 'uploader_id': '100005357179289', + 'uploader_url': 'https://www.facebook.com/hanaryushi', + }, + 'playlist_count': 2, }, { # data.mediaset.currMedia.edges 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', - 'only_matching': True, - }, { - # data.video.story.attachments[].media - 'url': 'facebook:544765982287235', - 'only_matching': True, - }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', - 'only_matching': True, - }, { - # data.video.creation_story.attachments[].media - 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', - 'only_matching': True, - }, { - # data.video - 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', - 'only_matching': True, - }, { - # no title - 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', - 'only_matching': True, - }, { - # data.video - 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { - 'id': '359649331226507', + 'id': '10153870694020942', 'ext': 'mp4', - 'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1', - 'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', - 'timestamp': 1527084179, - 'upload_date': '20180523', - 'uploader': 'ESL One Dota 2', - 'uploader_id': '100066514874195', - 'duration': 4524.212, - 'view_count': int, + 'title': 'Premier\'s Playoff Challenge', + 'description': 'md5:079134a18ac00b11ec5815fccf75a5a8', 'thumbnail': r're:^https?://.*', - 'concurrent_view_count': int, - }, - 'params': { - 'skip_download': True, + 'timestamp': 1429133517, + 'upload_date': '20150415', + 'uploader': 'Christy Clark', + 'uploader_id': '100045032167189', + 'uploader_url': r're:^https?://.*', + 'duration': 31.197, + 'view_count': int, + 'live_status': 'not_live', }, }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media - 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', + # data.event.cover_media_renderer.cover_video + 'url': 'https://www.facebook.com/events/464667289575302/', 'info_dict': { - 'id': '106560053808006', + 'id': '859946639295361', 'ext': 'mp4', - 'title': 'Josef', + 'title': 'June Salsa & Bachata Classes On Sundays for Absolute Beginners, Improvers & Advance level.', + 'description': 'Dance event in Hong Kong by Dance With Style on Sunday, June 16 2024', 'thumbnail': r're:^https?://.*', - 'concurrent_view_count': int, - 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', - 'timestamp': 1549275572, - 'duration': 3.413, - 'uploader': 'Josef Novak', - 'description': '', - 'upload_date': '20190204', + 'uploader': 'Dance With Style', + 'uploader_id': '100064171651675', + 'uploader_url': r're:^https?://.*', + 'live_status': 'not_live', }, }, { # data.video.story.attachments[].media - 'url': 'https://www.facebook.com/watch/?v=647537299265662', + 'url': 'facebook:544765982287235', 'only_matching': True, }, { - # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542 - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media - 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', - 'info_dict': { - 'id': '10157667649866271', - }, - 'playlist_count': 3, - 'skip': 'Requires logging in', + 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', + 'only_matching': True, }, { - # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', - 'info_dict': { - 'id': '117576630041613', - 'ext': 'mp4', - # TODO: title can be extracted from video page - 'title': 'Facebook video #117576630041613', - 'uploader_id': '189393014416438', - 'upload_date': '20201123', - 'timestamp': 1606162592, - }, - 'skip': 'Requires logging in', + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }, { - # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', - 'info_dict': { - 'id': '211567722618337', - 'ext': 'mp4', - 'title': 'Facebook video #211567722618337', - 'uploader_id': '127875227654254', - 'upload_date': '20161122', - 'timestamp': 1479793574, - }, - 'skip': 'No video', + 'only_matching': True, }, { - # data.video.creation_story.attachments[].media - 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', + 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'only_matching': True, }, { - 'url': 'https://www.facebook.com/watchparty/211641140192478', - 'info_dict': { - 'id': '211641140192478', - }, - 'playlist_count': 1, - 'skip': 'Requires logging in', + 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', + 'only_matching': True, }, { - # data.event.cover_media_renderer.cover_video - 'url': 'https://m.facebook.com/events/1509582499515440', - 'info_dict': { - 'id': '637246984455045', - 'ext': 'mp4', - 'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"', - 'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022', - 'thumbnail': r're:^https?://.*', - 'uploader': 'Comitato Liberi Pensatori', - 'uploader_id': '100065709540881', - }, + 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', + 'only_matching': True, + }, { + 'url': 'https://m.facebook.com/stories/121668313179875/', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/stories/100944752039935/UzpfSVNDOjY2ODMzMzk5NTUwMzc2MA==/', + 'only_matching': True, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' _api_config = { @@ -414,6 +410,9 @@ class FacebookIE(InfoExtractor): } def _perform_login(self, username, password): + # raise error because login with username/password is not working + self.raise_login_required('Login with username/password is currently not working', method='cookies') + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, @@ -426,7 +425,7 @@ def _perform_login(self, username, password): login_form = { 'email': username, - 'pass': password, + 'pass': password, # "encpass" is needed instead of plain password 'lsd': lsd, 'lgnrnd': lgnrnd, 'next': 'http://facebook.com/home.php', @@ -472,65 +471,415 @@ def _perform_login(self, username, password): self.report_warning(f'unable to log in: {err}') return - def _extract_from_url(self, url, video_id): + def _real_extract(self, url): + video_id = self._match_id(url) + url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url webpage = self._download_webpage( - url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) - - def extract_metadata(webpage): - post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] - post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( - k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) - title = get_first(media, ('title', 'text')) - description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - page_title = title or self._html_search_regex(( - r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', - r'(?s)(?P.*?)', - self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'(?P<content>.+?)', - ), webpage, 'title', default=None, group='content') - description = description or self._html_search_meta( - ['description', 'og:description', 'twitter:description'], - webpage, 'description', default=None) - uploader_data = ( - get_first(media, ('owner', {dict})) - or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) - or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) - or get_first(post, ('node', 'actors', ..., {dict})) - or get_first(post, ('event', 'event_creator', {dict})) or {}) - uploader = uploader_data.get('name') or ( - clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - or self._search_regex( - (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) - timestamp = int_or_none(self._search_regex( - r']+data-utime=["\'](\d+)', webpage, - 'timestamp', default=None)) - thumbnail = self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) - # some webpages contain unretrievable thumbnail urls - # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1 - # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ - if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): - thumbnail = None - info_dict = { - 'description': description, - 'uploader': uploader, - 'uploader_id': uploader_data.get('id'), - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'view_count': parse_count(self._search_regex( - (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'), - webpage, 'view count', default=None)), - 'concurrent_view_count': get_first(post, ( - ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), - } + re.sub(r'://(?:[\w-]+\.)?facebook\.com/', '://www.facebook.com/', url), video_id) + + post_data = re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage) + sjs_data = [self._parse_json(j, video_id, fatal=False) for j in post_data] + cookies = self._get_cookies(url) + # user passed logged-in cookies or attempted to login + login_data = cookies.get('c_user') and cookies.get('xs') + logged_in = False + if login_data: + logged_in = get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'define', + lambda _, v: 'CurrentUserInitialData' in v, ..., 'ACCOUNT_ID'), default='0') != '0' + if logged_in and (info := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data', + (('ufac_client', 'state', (('set_contact_point_state_renderer', 'title'), + ('intro_state_renderer', 'header_title'))), + ('epsilon_checkpoint', 'screen', 'title')), + ))): + if any(content in info for content in ['days left to appeal', 'suspended your account']): + raise ExtractorError('Your account is suspended', expected=True) + if 'Enter mobile number' == info: + raise ExtractorError('Facebook is requiring mobile number confirmation', expected=True) + if 'your account has been locked' in info: + raise ExtractorError('Your account has been locked', expected=True) + + if props := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., (None, (..., ...)), 'rootView', + lambda _, v: v.get('title') is not None)): + if not self._cookies_passed: + self.raise_login_required(method='cookies') + else: + msg = re.sub(r'\s{2,}', ' ', join_nonempty('title', 'body', delim='. ', from_dict=props)) + raise ExtractorError(f'This video is not available. Facebook said: {msg}', expected=True) + + if post_data and not re.search(r'"[^"]+[^(feed)]_story[^"]*":', ','.join(post_data)): + raise ExtractorError('An unknown error occurred. Please try again.', expected=True) + + def find_json_obj(json_strings, *patterns, obj_in_value=False, get_all=False): + """ + Find JSON object, in the form of a string, by regular expression + >>> obj = find_json_obj([json_a, _and_b], regex_a, (regex_b, _or_c), obj_in_value=False, get_all=True) + @param json_strings string/list JSON string/a list of JSON strings (match all) + *patterns string/tuple regex patterns (if tuple, return only the 1st matched pattern) + obj_in_value boolean False: find the object(s) containing (one of) the pattern(s) + True : given pattern(s) of the key(s) to find the + object(s) in the value of that key(s) + get_all boolean return the 1st or all of the results of each regex pattern + @return list of tuple a list of (matching pattern, matched JSON object) + """ + def find_offset(string, bracket, quote): + _BRACKET_MAP = { + '{': ([f'{{{quote}'], ['},', '}]', '}}', f'}}{quote}'], (1 if obj_in_value else -1)), + '}': (['},', '}]', '}}', f'}}{quote}'], [f'{{{quote}'], 1), + } # ([search pattern], [opposite sign], search direction); search direction: 1 - forward, -1 - backward + string = re.sub(rf'{{\\{quote}([^{quote}]+\\{quote}:)', rf'{{{quote}\1 ', string.replace('{}', '[]')) + count, b_sum, offset = 0, 0, 0 + for y, x in zip(((string[1:] + ' ') if _BRACKET_MAP[bracket][2] > 0 else (' ' + string[:-1]))[::_BRACKET_MAP[bracket][2]], + string[::_BRACKET_MAP[bracket][2]]): + s = (x + y) if _BRACKET_MAP[bracket][2] > 0 else (y + x) + count += (1 if s in _BRACKET_MAP[bracket][0] or s in _BRACKET_MAP[bracket][1] else 0) + b_sum += (1 if s in _BRACKET_MAP[bracket][0] else (-1 if s in _BRACKET_MAP[bracket][1] else 0)) + offset += 1 + if count > 0 and b_sum >= (0 if obj_in_value else 1): + break + return offset * _BRACKET_MAP[bracket][2] + + for json_str in (json_strings if isinstance(json_strings, list) else [json_strings]): # loop all + if isinstance(json_str, str): + # check if json_str is a JSON string and get the quotation mark (either " or ') + if quote := (lambda x: x.group(1) if x else None)(re.search(r'(["\']):\s*[\[{]*\1', json_str)): + for patterns_item in patterns: + for pattern in (patterns_item if isinstance(patterns_item, tuple) else [patterns_item]): + # 'patterns_item' loop - loop each item in *patterns (item can be a str or tuple) + found = False + if isinstance(pattern, str): + for m in re.finditer(pattern, json_str): # break according to get_all + if obj_in_value: + i = (lambda x, y: (m.start(m.lastindex or 0) + x - 1) if x > 0 + else ((m.end(m.lastindex or 0) + len(y.group(0)) - 1) if y else None) + )(m.group(m.lastindex or 0).rfind('{'), + re.match(r'^\w*(?:":)?:?\s*{', json_str[m.end(m.lastindex or 0):])) + else: + i = m.start(m.lastindex or 0) + if i: + opening = (i + find_offset( + json_str[(i * obj_in_value):(i * (not obj_in_value) - obj_in_value * 2 + 1)], '{', quote, + ) - obj_in_value) + closing = i + find_offset(json_str[i:], '}', quote) + if isinstance(opening, int) and isinstance(closing, int): + found = True + yield (m.group(0), json_str[opening:closing]) + if not get_all: + break + else: # if this for loop ends with break (i.e. not get_all), else clause is not executed + if found: + break # break 'patterns_item' loop if found and get_all + continue # move on to the next 'pattern' (if exists) in 'patterns_item' if not found + break # break 'patterns_item' loop if found and not get_all + if found and isinstance(patterns_item, tuple): + break # break 'patterns_item' loop if found and patterns_item is a tuple + + def extract_metadata(field=None): + if webpage_info.get(field) is not None: + return webpage_info[field] + elif field is None and webpage_info.get('timestamp') is not None: + return webpage_info + # extract data + description = title = timestamp = uploader_info = None + # uploader + if field == 'uploader': + for x in find_json_obj(post_data, (rf'actors{Q}:[^}}]+{Q}__isActor{Q}:', + rf'owner{Q}:[^}}]+{Q}name{Q}:\s*{Q}[^{Q}]'), + get_all=True): + if re.search(rf'id{Q}:\s*{Q}{s_id}{Q}', x[1]): + uploader_info = traverse_obj(json.loads(x[1]), { + 'uploader': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + 'name', {str}), + 'uploader_id': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + 'id', {str}), + 'uploader_url': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + 'url', {url_or_none}), + }, get_all=False) + break + if uploader_info: + webpage_info.update(uploader_info) + # title / description + if field in ('title', 'description', None): + for x in find_json_obj(post_data, + rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:', + get_all=True): + x_dict = json.loads(x[1]) + for i in [i for i in [s_id, p_id] if i is not None]: + if x_dict.get('id') == i: + if (description := x_dict['message'] if isinstance(x_dict['message'], str) + else traverse_obj(x_dict, ('message', 'text', {str_or_none}))): + if (track_title := (lambda x: x.group(0) if x else None + )(re.search(rf'{Q}track_title{Q}:\s*{Q}((?:[^{Q}\\]|\\.)*){Q}', x[1]))): + description += '. ' + json.loads('{' + track_title + '}')['track_title'] + break + if description: + break + description = description or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default='') + for x in find_json_obj(post_data, rf'title{Q}:\s*[^}}]+{Q}text{Q}:\s*{Q}[^{Q}]', get_all=True): + x_dict = json.loads(x[1]) + if p_id: + if (text := traverse_obj(x_dict, ('title', 'text', {str_or_none}))): + title = title or (text if x_dict.get('id') == p_id else None) + description = description or (text if x_dict.get('id') == s_id else description) + if title and description: + break + title = (lambda x: x if x != extract_metadata('uploader') else None + )(title + or (self._html_search_regex( + (r'\s(?P<content>[\s\S]+?)\s', + r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', + r'(?s)(?P.*?)'), + re.sub(r'(Facebook(\sLive)?)|(Video)', '', webpage), + 'title', default='', group='content') + or (lambda x: '' if not x or x.group(1) in ('Video', 'Facebook', 'Facebook Live') + else x.group(1).encode().decode('unicode_escape') + )(re.search(rf'{Q}meta{Q}:\s*{{{Q}title{Q}:\s*{Q}((?:[^{Q}\\]|\\.)*){Q}', webpage)) + or og_title + ).split(' | ')[0] + or re.sub(r'(\s*\n\s*)', ' ', description)) + webpage_info['title'] = title if len(title or '') <= 100 else title[:(47 + title[47:67].rfind(' '))] + '...' + webpage_info['description'] = description + # timestamp + if field in ('timestamp', None): + for x in find_json_obj(post_data, + rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,', rf'publish_time{Q}:\s*\d+,', + get_all=True): + if re.search(rf'id{Q}:\s*{Q}(?:(?:{s_id})|(?:{p_id})){Q}', x[1]): + if timestamp := json.loads(x[1]).get(re.split(f'{Q}', x[0])[0]): + break + webpage_info['timestamp'] = timestamp + # return data + return webpage_info.get(field) if field else webpage_info + + og_title = self._og_search_title(webpage, default='').split(' | ') + if len(og_title) > 1 and re.search(r'\s(?:reactions|shares|views)\s', og_title[0]): + og_title.pop(0) + og_title = re.sub(r'(\s*\n\s*)', ' ', ' | '.join(og_title)) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) + if thumbnail and not re.search(r'\.(?:gif|jpg|png|webp)', thumbnail): + thumbnail = None + + webpage_info = { + 'thumbnails': [{k: v for k, v in { + 'url': thumbnail, + 'height': (lambda x: int_or_none(x.group(1)) if x else None + )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', thumbnail)), + 'preference': None if 'stp=' in thumbnail else 1, + }.items() if v is not None}] if url_or_none(thumbnail) else [], + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'), + webpage, 'view count', default=None)), + } + + p_id, s_id, linked_url, data = None, None, None, [] + Q = (lambda x: x.group(1) if x else '"')(re.search(r'(["\']):\s*[\[{]*\1', (post_data[0] if post_data else ''))) + for p_data in post_data[:]: + if rf'{Q}feed_unit{Q}:' in p_data or not re.search(rf'{Q}(?:dash_manifest_url|message|event_description){Q}:', p_data): + # discard useless data + post_data.remove(p_data) + else: + if (not s_id or not p_id) and (f'{Q}story{Q}:' in p_data or f'{Q}creation_story{Q}:' in p_data): + p_id = p_id if p_id else (lambda x: x.group(1) if x else + (video_id if video_id.isnumeric() else None) + )(re.search(rf'{Q}(?:post_id|videoId|video_id){Q}:\s*{Q}(\d+){Q}', p_data)) + s_id = s_id if s_id else (lambda x: x.group(1) if x else None + )(re.search(rf'id{Q}:\s*{Q}(Uzpf[^{Q}]+){Q}', p_data)) + if not data: + if re.search(rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:', p_data): + # linked video + for x in find_json_obj(p_data, rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:', obj_in_value=True): + if linked_url := traverse_obj( + json.loads(x[1]), (('web_link', None), 'url', {url_or_none}), get_all=False): + url_transparent = '.facebook.com' not in urllib.parse.urlparse(linked_url).netloc + data = x[1] + break + elif f'{Q}dash_manifest_url{Q}:' in p_data[:p_data.find(f'{Q}comment_list_renderer{Q}:')]: + for x in find_json_obj(p_data, rf'{Q}data{Q}:\s*{{', rf'{Q}data{Q}:', obj_in_value=True): + if f'{Q}dash_manifest_url{Q}:' in x[1]: + data = x[1] + break + + if linked_url: + return self.url_result(linked_url, video_id=video_id, url_transparent=url_transparent, + **{k: v for k, v in (extract_metadata() if url_transparent else {}).items() if v}) + + def extract_dash_manifest(video, formats=[], subtitles={}): + dash_manifest = traverse_obj(video, 'playlist', 'dash_manifest', expected_type=str) + if dash_manifest: + dash_fmts, dash_subs = self._parse_mpd_formats_and_subtitles( + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), + mpd_url=video.get('dash_manifest_url')) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=subtitles) + return formats, subtitles - info_json_ld = self._search_json_ld(webpage, video_id, default={}) - info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '') - or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}') - return merge_dicts(info_json_ld, info_dict) + def process_formats(info): + for f in info['formats']: + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + # Formats larger than ~500MB will return error 403 unless chunk size is regulated + f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 + if data: + def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + formats = [] + captions, subtitles = {}, {} + is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) + for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): + playable_url = video.get(key) + if not playable_url: + continue + if determine_ext(playable_url) == 'mpd': + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles(playable_url, video_id) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=(captions if is_broadcast else subtitles)) + else: + q = qualities(['sd', 'hd']) + formats.append({ + 'format_id': format_id, + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, + 'url': playable_url, + }) + if is_broadcast: + formats, captions = extract_dash_manifest(video, formats, captions) + else: + formats, subtitles = extract_dash_manifest(video, formats, subtitles) + + # captions/subtitles + for caption in traverse_obj(video, ( + 'video_available_captions_locales', + {lambda x: sorted(x, key=lambda c: c['locale'])}, + lambda _, v: url_or_none(v['captions_url']), + )): + lang = caption.get('localized_language') or 'und' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), + } + if caption.get('localized_creation_method') or is_broadcast: + captions.setdefault(caption['locale'], []).append(subs) + else: + subtitles.setdefault(caption['locale'], []).append(subs) + captions_url = traverse_obj(video, ('captions_url', {url_or_none})) + if captions_url and not captions and not subtitles: + locale = self._html_search_meta( + ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + (captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] + # thumbnails + thumbnails = [] + for url in [uri for uri in [traverse_obj(video, path) for path in [ + ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri'), + ('image', 'uri'), ('previewImage', 'uri'), + ]] if url_or_none(uri) is not None]: + if (re.search(r'\.(?:jpg|png)', url) + and not any(url.split('_cat=')[0] in t['url'] for t in thumbnails)): + thumbnails.append({k: v for k, v in { + 'url': url, + 'height': (lambda x: int_or_none(x.group(1)) if x else None + )(re.search(r'stp=.+_[a-z]\d+x(\d+)&', url)), + 'preference': None if 'stp=' in url else 1, + }.items() if v is not None}) + # timestamp + v_timestamp = traverse_obj(video, 'publish_time', 'creation_time', 'created_time', {int_or_none}) + if not v_timestamp and v_id != video_id: + for x in find_json_obj(post_data, + rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,', rf'publish_time{Q}:\s*\d+,', + get_all=True): + if re.search(rf'id{Q}:\s*{Q}{v_id}{Q}', x[1]): + if v_timestamp := json.loads(x[1]).get(x[0].split(f'{Q}')[0]): + break + # uploader + if uploader_id := traverse_obj(video, ('owner', 'id', {str_or_none})): + if x := list(find_json_obj(data, ( + rf'id{Q}:\s*{Q}{uploader_id}{Q}[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]', + rf'{Q}name{Q}:\s*{Q}[^{Q}][^}}]*{Q}id{Q}:\s*{Q}{uploader_id}{Q}'))): + if x[0][1]: + video['owner'] = merge_dicts(video['owner'], json.loads(x[0][1])) + elif x := list(find_json_obj(post_data, (rf'(owner{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]'), + rf'((?!share)\w{{5}}_creator{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]', + obj_in_value=True)): + if x[0][1]: + video['owner'] = json.loads(x[0][1]) + uploader_id = traverse_obj(video, ('owner', 'id', {str_or_none})) + uploader = traverse_obj(video, ('owner', 'name', {str_or_none})) or extract_metadata('uploader') + # description + v_desc = traverse_obj(video, ('savable_description', 'text', {str_or_none})) + if not v_desc and v_id != video_id: + if vs_id := traverse_obj(video, ( + (None, (..., 'video')), 'creation_story', 'id', {str_or_none}), get_all=False): + if x := list(find_json_obj( + data, rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:\s*{Q}{vs_id}{Q}')): + v_desc = (lambda x: x if x != uploader else None)(json.loads(x[0][1])['message']['text']) + else: + for x in find_json_obj(data, rf'video{Q}:\s*{{{Q}id{Q}:\s*{Q}{v_id}{Q}', get_all=True): + if v_desc := traverse_obj(json.loads(x[1]), ('message', 'text', {str_or_none})): + break + # title + if v_name := video.get('name'): + v_title = v_name if len(v_name) <= 100 else v_name[:(47 + v_name[47:67].rfind(' '))] + '...' + + info = { + 'id': v_id, + 'title': ((v_title if video.get('name') else None) + or (f"{extract_metadata('title')} Facebook Video #{v_id}" if extract_metadata('title') + else (f'{uploader}Facebook Video #{v_id}' if uploader else f'Facebook Video #{v_id}'))), + 'description': v_desc or extract_metadata('description'), + 'thumbnails': thumbnails, + 'timestamp': v_timestamp or extract_metadata('timestamp'), + 'uploader': uploader, + 'uploader_id': uploader_id or webpage_info.get('uploader_id'), + 'uploader_url': (traverse_obj(video, ('owner', 'url', {url_or_none})) + or (webpage_info.get('uploader_url') if webpage_info.get('uploader') == uploader else None) + or (lambda x: f'https://www.facebook.com/profile.php?id={x}' if x else None + )(uploader_id or webpage_info.get('uploader_id'))), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), + 'formats': formats, + 'automatic_captions': captions, + 'subtitles': subtitles, + 'is_live': video.get('is_live_streaming'), + 'was_live': (video.get('broadcast_status') == 'VOD_READY'), + 'concurrent_view_count': video.get('liveViewerCount'), + } + process_formats(info) + entries.append(info) + + entries, video_ids = [], [] + for idx, x in enumerate(find_json_obj(data, + (rf'dash_manifest_url{Q}:\s*{Q}', rf'_hd_url{Q}:\s*{Q}', rf'_sd_url{Q}:\s*{Q}'), + get_all=True)): + media = json.loads(x[1]) + if (media.get('__typename', 'Video') == 'Video' + and not media.get('sticker_image') + and media.get('id', f'{video_id}_{idx}') not in video_ids): + video_ids.append(media.get('id', f'{video_id}_{idx}')) + parse_graphql_video(media) + if media.get('id') == video_id: + break + + if len(entries) > 1: + return self.playlist_result(entries, video_id, **{ + k: v for k, v in extract_metadata().items() if v}) + + video_info = entries[0] if entries else {'id': video_id} + video_info['title'] = re.sub(r'(?({{.*?{_filter}.*?}})', webpage): - yield self._parse_json(relay_data, video_id, fatal=False) or {} - - def extract_relay_data(_filter): - return next(filter(None, yield_all_relay_data(_filter)), {}) - - def extract_relay_prefetched_data(_filter, target_keys=None): - path = 'data' - if target_keys is not None: - path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys)) - return traverse_obj(yield_all_relay_data(_filter), ( - ..., 'require', (None, (..., ..., ..., '__bbox', 'require')), - lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), - ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {} + if server_js_data := self._parse_json(self._search_regex( + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False): + video_data = extract_video_data(server_js_data.get('instances', [])) if not video_data: - server_js_data = self._parse_json(self._search_regex([ + if server_js_data := self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);', - ], webpage, 'js data', default='{}'), video_id, js_to_json, False) - video_data = extract_from_jsmods_instances(server_js_data) - - if not video_data: - data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)', - target_keys=('video', 'event', 'nodes', 'node', 'mediaset')) - if data: - entries = [] - - def parse_graphql_video(video): - v_id = video.get('videoId') or video.get('id') or video_id - reel_info = traverse_obj( - video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) - if reel_info: - video = video['creation_story'] - video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) - video.update(reel_info) - formats = [] - q = qualities(['sd', 'hd']) - for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), - ('browser_native_sd_url', 'sd')): - playable_url = video.get(key) - if not playable_url: - continue - if determine_ext(playable_url) == 'mpd': - formats.extend(self._extract_mpd_formats(playable_url, video_id)) - else: - formats.append({ - 'format_id': format_id, - # sd, hd formats w/o resolution info should be deprioritized below DASH - 'quality': q(format_id) - 3, - 'url': playable_url, - }) - extract_dash_manifest(video, formats) - if not formats: - # Do not append false positive entry w/o any formats - return - - automatic_captions, subtitles = {}, {} - is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) - for caption in traverse_obj(video, ( - 'video_available_captions_locales', - {lambda x: sorted(x, key=lambda c: c['locale'])}, - lambda _, v: url_or_none(v['captions_url']), - )): - lang = caption.get('localized_language') or 'und' - subs = { - 'url': caption['captions_url'], - 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), - } - if caption.get('localized_creation_method') or is_broadcast: - automatic_captions.setdefault(caption['locale'], []).append(subs) - else: - subtitles.setdefault(caption['locale'], []).append(subs) - captions_url = traverse_obj(video, ('captions_url', {url_or_none})) - if captions_url and not automatic_captions and not subtitles: - locale = self._html_search_meta( - ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') - (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] - - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': traverse_obj( - video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), - 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), - 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) - or float_or_none(video.get('length_in_second'))), - 'automatic_captions': automatic_captions, - 'subtitles': subtitles, - } - process_formats(info) - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 'description': description, - }) - else: - info['title'] = description or f'Facebook video #{v_id}' - entries.append(info) - - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) - - nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) - attachments = traverse_obj(nodes, ( - ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', - ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), - 'attachment', {dict})) - for attachment in attachments: - ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), - ('target', 'attachments', ..., 'styles', 'attachment', {dict})) - for n in ns: - parse_attachment(n) - parse_attachment(attachment) - - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') - - video = traverse_obj(data, ( - 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'], - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - if len(entries) > 1: - return self.playlist_result(entries, video_id) - - video_info = entries[0] if entries else {'id': video_id} - webpage_info = extract_metadata(webpage) - # honor precise duration in video info - if video_info.get('duration'): - webpage_info['duration'] = video_info['duration'] - # preserve preferred_thumbnail in video info - if video_info.get('thumbnail'): - webpage_info['thumbnail'] = video_info['thumbnail'] - return merge_dicts(webpage_info, video_info) + ], webpage, 'js data', default='{}'), video_id, js_to_json, False): + video_data = extract_from_jsmods_instances(server_js_data) - if not video_data: - m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) - if m_msg is not None: - raise ExtractorError( - f'The video is not available, Facebook said: "{m_msg.group(1)}"', - expected=True) - elif any(p in webpage for p in ( - '>You must log in to continue', - 'id="login_form"', - 'id="loginbutton"')): - self.raise_login_required() - - if not video_data and '/watchparty/' in url: - post_data = { - 'doc_id': 3731964053542869, - 'variables': json.dumps({ - 'livingRoomID': video_id, - }), - } - - prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') - if prefetched_data: - lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) - if lsd: - post_data[lsd['name']] = lsd['value'] - - relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') - for define in (relay_data.get('define') or []): - if define[0] == 'RelayAPIConfigDefaults': - self._api_config = define[2] - - living_room = self._download_json( - urljoin(url, self._api_config['graphURI']), video_id, - data=urlencode_postdata(post_data))['data']['living_room'] - - entries = [] - for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): - video = try_get(edge, lambda x: x['node']['video']) or {} - v_id = video.get('id') - if not v_id: - continue - v_id = str(v_id) - entries.append(self.url_result( - self._VIDEO_PAGE_TEMPLATE % v_id, - self.ie_key(), v_id, video.get('name'))) - - return self.playlist_result(entries, video_id) - - if not video_data: + if not video_data and False: # skipped because not working # Video info not in first request, do a secondary request using # tahoe player specific URL tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + self._VIDEO_PAGE_TAHOE_TEMPLATE % p_id, video_id, + fatal=False, + expected_status=404, data=urlencode_postdata({ '__a': 1, '__pc': self._search_regex( @@ -791,15 +930,30 @@ def parse_attachment(attachment, key='media'): headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) - tahoe_js_data = self._parse_json( - self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, - 'tahoe js data', default='{}'), - video_id, fatal=False) - video_data = extract_from_jsmods_instances(tahoe_js_data) + if tahoe_data: + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data: - raise ExtractorError('Cannot parse data') + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) + if m_msg is not None: + raise ExtractorError( + f'The video is not available, Facebook said: "{m_msg.group(1)}"', + expected=True) + elif any(p in webpage for p in ( + '>You must log in to continue', + 'id="login_form"', + 'id="loginbutton"')): + self.raise_login_required(method='cookies') + elif not login_data: + self.raise_login_required('No video formats found', method='cookies') + elif not logged_in: + self.raise_login_required('Failed to login with provided data', method='cookies') + self.raise_no_formats('No video formats found!') if len(video_data) > 1: entries = [] @@ -810,6 +964,7 @@ def parse_attachment(attachment, key='media'): entries.append(self.url_result(urljoin( url, video_url), self.ie_key(), v[0].get('video_id'))) return self.playlist_result(entries, video_id) + video_data = video_data[0] formats = [] @@ -835,7 +990,7 @@ def parse_attachment(attachment, key='media'): 'quality': preference, 'height': 720 if quality == 'hd' else None, }) - extract_dash_manifest(f[0], formats) + formats, subtitles = extract_dash_manifest(f[0], formats, subtitles) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) @@ -846,31 +1001,31 @@ def parse_attachment(attachment, key='media'): 'subtitles': subtitles, } process_formats(info_dict) - info_dict.update(extract_metadata(webpage)) + info_dict.update(webpage_info) return info_dict - def _real_extract(self, url): - video_id = self._match_id(url) - - real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - return self._extract_from_url(real_url, video_id) - class FacebookPluginsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' _TESTS = [{ 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', - 'md5': '5954e92cdfe51fe5782ae9bda7058a07', 'info_dict': { 'id': '10154383743583686', 'ext': 'mp4', - # TODO: Fix title, uploader 'title': 'What to do during the haze?', - 'uploader': 'Gov.sg', + 'description': 'md5:81839c0979803a014b20798df255ed0b', + 'thumbnail': r're:^https?://.*', + 'uploader': 'gov.sg', + 'uploader_id': '100064718678925', + 'uploader_url': r're:^https?://.*', 'upload_date': '20160826', 'timestamp': 1472184808, + 'duration': 65.087, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', }, 'add_ie': [FacebookIE.ie_key()], }, { @@ -901,7 +1056,7 @@ class FacebookRedirectURLIE(InfoExtractor): 'playable_in_embed': True, 'categories': ['Music'], 'channel': 'Boiler Room', - 'uploader_id': 'brtvofficial', + 'uploader_id': '@boilerroom', 'uploader': 'Boiler Room', 'tags': 'count:11', 'duration': 3332, @@ -909,11 +1064,16 @@ class FacebookRedirectURLIE(InfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', 'availability': 'public', - 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'uploader_url': r're:^https?://.*', 'upload_date': '20150917', + 'timestamp': 1442489450, 'age_limit': 0, 'view_count': int, 'like_count': int, + 'heatmap': 'count:100', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, @@ -932,25 +1092,26 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'f13dd37f2633595982db5ed8765474d3', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', - 'uploader_id': '100040874179269', + 'title': 'md5:32aab9976c6b8a145fc0d799631e2b74', + 'description': 'md5:3ea795c5ebb7ed28e3e78bb7b1191753', + 'uploader': 'Beast Camp Training', + 'uploader_id': '1738535909799870', + 'uploader_url': r're:^https?://.*', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'live_status': 'not_live', }, }] def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( - f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) + f'https://www.facebook.com/watch/?v={video_id}', FacebookIE, video_id) class FacebookAdsIE(InfoExtractor): @@ -1034,11 +1195,15 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - post_data = traverse_obj( - re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) - data = get_first(post_data, ( - 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., - 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) + if post_data := traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})): + data = get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) + elif post_data := traverse_obj( + re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage), (..., {json.loads})): + data = get_first(post_data, ( + 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict})) if not data: raise ExtractorError('Unable to extract ad data')