From e9d5849209582e850784bdbc3f943205ac39f922 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:06:41 +0800 Subject: [PATCH] [ie/facebook] Experimental - extract data from JSON strings by searching with regex instead of traversing dicts - extract linked video - improved thumbnail info - more lenient url pattern - live status --- yt_dlp/extractor/facebook.py | 1236 ++++++++++++++++++++-------------- 1 file changed, 725 insertions(+), 511 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 24ecb03505e6..90bd674586c1 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -12,7 +12,6 @@ determine_ext, float_or_none, format_field, - get_element_by_id, get_first, int_or_none, join_nonempty, @@ -39,19 +38,12 @@ class FacebookIE(InfoExtractor): (?:[^#]*?\#!/)? (?: (?: - permalink\.php| - video/video\.php| - photo\.php| - video\.php| - video/embed| - story\.php| - watch(?:/live)?/? + (?:video/)?[a-z]{5,}(?:\.php|/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)?| - [^/]+/posts/| + [^/]+/(?:videos|posts)/(?:[^/]+/)?| events/(?:[^/]+/)?| - groups/[^/]+/(?:permalink|posts)/(?:[\da-f]+/)?| - watchparty/ + groups/[^/]+/(?:permalink|posts)/| + [a-z]{5,}/| )| facebook: ) @@ -73,179 +65,132 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', 'info_dict': { 'id': '3676516585958356', 'ext': 'mp4', 'title': 'dr Adam Przygoda', 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', - 'uploader': 'RADIO KICKS FM', - 'upload_date': '20230818', - 'timestamp': 1692346159, 'thumbnail': r're:^https?://.*', + 'timestamp': 1692346159, + 'upload_date': '20230818', + 'uploader': 'RADIO KICKS FM', 'uploader_id': '100063551323670', + 'uploader_url': r're:^https?://.*', 'duration': 3133.583, + 'live_status': 'was_live', + 'concurrent_view_count': int, 'view_count': int, - 'concurrent_view_count': 0, + 'like_count': int, + 'comment_count': int, }, }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', - 'md5': '6a40d33c0eccbb1af76cf0485a052659', - 'info_dict': { - 'id': '637842556329505', - 'ext': 'mp4', - 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', - 'uploader': 'Tennis on Facebook', - 'upload_date': '20140908', - 'timestamp': 1410199200, - }, - 'skip': 'Requires logging in', + 'only_matching': True, }, { - # data.video + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', 'title': 'Asif', 'description': '', - 'uploader': 'Asif Nawab Butt', - 'upload_date': '20140506', - 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl', + 'timestamp': 1399398998, + 'upload_date': '20140506', + 'uploader': 'Asif Nawab Butt', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'duration': 131.03, + 'live_status': 'not_live', 'concurrent_view_count': int, 'view_count': int, + 'like_count': int, + 'comment_count': int, }, }, { - 'note': 'Video with DASH manifest', - 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': 'b2c28d528273b323abe5c6ab59f0f030', - 'info_dict': { - 'id': '957955867617029', - 'ext': 'mp4', - 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', - 'uploader': 'Demy de Zeeuw', - 'upload_date': '20160110', - 'timestamp': 1452431627, - }, - 'skip': 'Requires logging in', - }, { - 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', - 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', - 'info_dict': { - 'id': '544765982287235', - 'ext': 'mp4', - 'title': '"What are you doing running in the snow?"', - 'uploader': 'FailArmy', - }, - 'skip': 'Video gone', - }, { - 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', - 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', - 'info_dict': { - 'id': '1035862816472149', - 'ext': 'mp4', - 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', - 'uploader': 'S. Saint', - }, - 'skip': 'Video gone', - }, { - 'note': 'swf params escaped', - 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', - 'md5': '97ba073838964d12c70566e0085c2b91', - 'info_dict': { - 'id': '10153664894881749', - 'ext': 'mp4', - 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', - 'thumbnail': r're:^https?://.*', - 'timestamp': 1456259628, - 'upload_date': '20160223', - 'uploader': 'Barack Obama', - }, - 'skip': 'Gif on giphy.com gone', - }, { - # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '1659aa21fb3dd1585874f668e81a72c8', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', 'title': 'Holocaust survivor becomes US citizen', 'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f', + 'thumbnail': r're:^https?://.*', 'timestamp': 1477818095, 'upload_date': '20161030', 'uploader': 'CNN', - 'thumbnail': r're:^https?://.*', - 'view_count': int, 'uploader_id': '100059479812265', - 'concurrent_view_count': int, + 'uploader_url': r're:^https?://.*', 'duration': 44.181, - }, - }, { - # FIXME: unable to extract uploader, no formats found - # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', - 'info_dict': { - 'id': '1417995061575415', - 'ext': 'mp4', - 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', - 'description': 'Довгоочікуване відео', - 'timestamp': 1486648217, - 'upload_date': '20170209', - 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', + 'live_status': 'not_live', 'concurrent_view_count': int, - 'thumbnail': r're:^https?://.*', 'view_count': int, - 'duration': 11736.446, - }, - 'params': { - 'skip_download': True, + 'like_count': int, + 'comment_count': int, }, }, { - # FIXME: Cannot parse data error + # web_link url: http://giphy.com/gifs/l3vR8BKU0m8uX2mAg 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 'info_dict': { - 'id': '1072691702860471', + 'id': 'l3vR8BKU0m8uX2mAg', 'ext': 'mp4', - 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', + 'title': 'Nada mas satisfactorio que los otros 5... - La Guía Del Varón', + 'description': 'Nada mas satisfactorio que los otros 5 minutos', + 'tags': ['giphyupload'], + 'thumbnail': r're:^https?://.*', 'timestamp': 1477305000, - 'upload_date': '20161024', + 'upload_date': '20161022', 'uploader': 'La Guía Del Varón', - 'thumbnail': r're:^https?://.*', + 'uploader_id': '100050567346031', + 'uploader_url': r're:^https?://.*', + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Requires logging in', + 'skip': 'Gif on giphy.com', }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media + # data.node.comet_sections.content.story.attachments[].styles.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '202882990186699', 'ext': 'mp4', - 'title': 'birb (O v O") | Hello? Yes your uber ride is here', - 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...', - 'timestamp': 1486035513, + 'title': 'birb (O v O")', + 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', - 'uploader_id': '100013949973717', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', + 'duration': 23.714, + 'live_status': 'not_live', + 'concurrent_view_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, - 'skip': 'Requires logging in', }, { - # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media + # web_link url: https://www.facebook.com/attn/videos/1569199726448814/ 'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/', 'info_dict': { 'id': '1569199726448814', 'ext': 'mp4', - 'title': 'Pence MUST GO!', - 'description': 'Vickie Gentry shared a memory.', + 'title': 'What if marijuana companies were allowed to have TV ads like B...', + 'description': 'What if we treated marijuana ads like big pharma ads?', + 'thumbnail': r're:^https?://.*', 'timestamp': 1511548260, 'upload_date': '20171124', - 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l', - 'thumbnail': r're:^https?://.*', + 'uploader': 'ATTN:', + 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', 'duration': 148.224, + 'live_status': 'not_live', + 'concurrent_view_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -253,15 +198,21 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '6968553779868435', 'ext': 'mp4', + 'title': 'ATTN: - Learning new problem-solving skills is hard for...', 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', - 'uploader': 'ATTN:', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1701975646, 'upload_date': '20231207', - 'title': 'ATTN:', - 'duration': 132.675, + 'uploader': 'ATTN:', 'uploader_id': '100064451419378', + 'uploader_url': r're:^https?://.*', + 'duration': 132.675, + 'live_status': 'not_live', + 'concurrent_view_count': int, 'view_count': int, - 'thumbnail': r're:^https?://.*', - 'timestamp': 1701975646, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -269,149 +220,225 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '270103405756416', 'ext': 'mp4', - 'title': 'Lela Evans', - 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', + 'title': 'Lela Evans - Today Makkovik\'s own Pilot Mandy Smith made...', + 'description': 'md5:cc93a91feb89923303c1f78656791e4d', 'thumbnail': r're:^https?://.*', - 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl', - 'upload_date': '20231228', 'timestamp': 1703804085, + 'upload_date': '20231228', + 'uploader': 'Lela Evans', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', 'duration': 394.347, + 'live_status': 'not_live', + 'concurrent_view_count': int, 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', 'only_matching': True, - }, { - 'url': 'https://www.facebook.com/video.php?v=10204634152394104', - 'only_matching': True, }, { 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, }, { - # data.mediaset.currMedia.edges + # data.mediaset.currMedia.edges[].node 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', - 'only_matching': True, + 'info_dict': { + 'id': '10153870694020942', + 'ext': 'mp4', + 'title': 'Premier\'s Playoff Challenge', + 'description': 'md5:079134a18ac00b11ec5815fccf75a5a8', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1429133517, + 'upload_date': '20150415', + 'uploader': 'Christy Clark', + 'uploader_id': '100045032167189', + 'uploader_url': r're:^https?://.*', + 'duration': 31.197, + 'live_status': 'not_live', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, }, { # data.video.story.attachments[].media 'url': 'facebook:544765982287235', 'only_matching': True, }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', - 'only_matching': True, - }, { - # data.video.creation_story.attachments[].media 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, }, { - # data.video 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', 'only_matching': True, }, { - # no title - 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', - 'only_matching': True, - }, { - # data.video + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { 'id': '359649331226507', 'ext': 'mp4', 'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1', 'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'thumbnail': r're:^https?://.*', 'timestamp': 1527084179, 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', 'uploader_id': '100066514874195', + 'uploader_url': r're:^https?://.*', 'duration': 4524.001, - 'view_count': int, - 'thumbnail': r're:^https?://.*', + 'live_status': 'not_live', 'concurrent_view_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { 'id': '106560053808006', 'ext': 'mp4', 'title': 'Josef', + 'description': '', 'thumbnail': r're:^https?://.*', - 'concurrent_view_count': int, - 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl', 'timestamp': 1549275572, - 'duration': 3.283, - 'uploader': 'Josef Novak', - 'description': '', 'upload_date': '20190204', + 'uploader': 'Josef Novak', + 'uploader_id': r're:^pfbid.*', + 'uploader_url': r're:^https?://.*', + 'duration': 3.283, + 'live_status': 'not_live', + 'concurrent_view_count': int, + 'comment_count': int, }, }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', - 'only_matching': True, - }, { - # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542 - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media - 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', 'info_dict': { - 'id': '10157667649866271', + 'id': '647537299265662', + 'ext': 'mp4', + 'title': 'Padre enseña a su hijo a cómo bañar un recién nacido junto con su...', + 'description': 'Padre ense\u00f1a a su hijo a c\u00f3mo ba\u00f1ar un reci\u00e9n nacido junto con su gato y se hace viral, mir\u00e1 el video 😍', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1605534618, + 'upload_date': '20201116', + 'uploader': 'InfoPico', + 'uploader_id': '100064391811349', + 'uploader_url': r're:^https?://.*', + 'duration': 136.179, + 'live_status': 'not_live', + 'concurrent_view_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'playlist_count': 3, - 'skip': 'Requires logging in', }, { - # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media + # data.video.story.attachments[].media 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', 'info_dict': { 'id': '117576630041613', 'ext': 'mp4', - # TODO: title can be extracted from video page - 'title': 'Facebook video #117576630041613', - 'uploader_id': '189393014416438', - 'upload_date': '20201123', + 'title': 'Officers Rescue Trapped Motorist from Mahoning River Crash 11-22-20', + 'thumbnail': r're:^https?://.*', 'timestamp': 1606162592, + 'upload_date': '20201123', + 'uploader': 'City of Alliance Police Department', + 'uploader_id': '100064413680392', + 'uploader_url': r're:^https?://.*', + 'duration': 101.504, + 'concurrent_view_count': int, + 'view_count': int, + 'comment_count': int, }, 'skip': 'Requires logging in', }, { - # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', + 'only_matching': True, + }, { + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', 'info_dict': { - 'id': '211567722618337', + 'id': '1823658634322275', 'ext': 'mp4', - 'title': 'Facebook video #211567722618337', - 'uploader_id': '127875227654254', - 'upload_date': '20161122', - 'timestamp': 1479793574, + 'title': 'Live Webcam from Corfu - Greece', + 'description': 'md5:84c1af6894ecffe710c79744e4873e85', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1521449766, + 'upload_date': '20180319', + 'uploader': 'SkylineWebcams', + 'uploader_id': '100064307154679', + 'uploader_url': r're:^https?://.*', + 'duration': 14424.199, + 'live_status': 'was_live', + 'concurrent_view_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, }, - 'skip': 'No video', }, { - # data.video.creation_story.attachments[].media - 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', - 'only_matching': True, + # data.node.comet_sections.content.story.attachments[].style.attachment.all_subattachments.nodes[].media.video_grid_renderer.video + 'url': 'https://www.facebook.com/story.php?story_fbid=5268096689957022&id=100002702286715', + 'info_dict': { + 'id': '669977824610306', + 'ext': 'mp4', + 'title': 'md5:f2666feb05057a09f8b6f542cd7a3eda', + 'description': 'md5:f5775e7245153857caade33e757ceb21', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1671780203, + 'upload_date': '20221223', + 'uploader': 'Azura Tan Siow Ling', + 'uploader_id': '100002702286715', + 'uploader_url': r're:^https?://.*', + 'duration': 20.666, + 'live_status': 'not_live', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, }, { - 'url': 'https://www.facebook.com/watchparty/211641140192478', + # data.node.comet_sections.content.story.attachments[].styles.attachment.all_subattachments.nodes[].media.video_grid_renderer.video + 'url': 'https://www.facebook.com/hanaryushi/posts/pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l', 'info_dict': { - 'id': '211641140192478', + 'id': 'pfbid02Thgaymz9f4QXZ1XogoP4eETpdY2WSy7CLGCMuy3VVQopeet9MHbYR7H9tXYD4UE5l', + 'title': 'Hana Ryuushi - “sharing a relationship of having our...', + 'description': 'md5:75d2f9d921f40e90ba3b176f0d827cf7', + 'timestamp': 1706949770, + 'upload_date': '20240203', + 'uploader': 'Hana Ryuushi', + 'uploader_id': '100005357179289', + 'uploader_url': 'https://www.facebook.com/hanaryushi', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, - 'playlist_count': 1, - 'skip': 'Requires logging in', + 'playlist_count': 2, }, { - # FIXME: Cannot parse data error # data.event.cover_media_renderer.cover_video - 'url': 'https://m.facebook.com/events/1509582499515440', + 'url': 'https://www.facebook.com/events/464667289575302/', 'info_dict': { - 'id': '637246984455045', + 'id': '859946639295361', 'ext': 'mp4', - 'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"', - 'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022', + 'title': 'June Salsa & Bachata Classes On Sundays for Absolute Beginners, Improvers & Advance level.', + 'description': 'Dance event in Hong Kong by Dance With Style on Sunday, June 16 2024', 'thumbnail': r're:^https?://.*', - 'uploader': 'Comitato Liberi Pensatori', - 'uploader_id': '100065709540881', + 'uploader': 'Dance With Style', + 'uploader_id': '100064171651675', + 'uploader_url': r're:^https?://.*', + 'live_status': 'not_live', }, }, { - 'url': 'https://www.facebook.com/groups/1513990329015294/posts/d41d8cd9/2013209885760000/?app=fbl', + 'url': 'https://m.facebook.com/stories/121668313179875/', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/stories/100944752039935/UzpfSVNDOjY2ODMzMzk5NTUwMzc2MA==/', 'only_matching': True, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' @@ -420,6 +447,9 @@ class FacebookIE(InfoExtractor): } def _perform_login(self, username, password): + # raise error because login with username/password is not working + self.raise_login_required('Login with username/password is currently not working', method='cookies') + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, @@ -432,7 +462,7 @@ def _perform_login(self, username, password): login_form = { 'email': username, - 'pass': password, + 'pass': password, # "encpass" is needed instead of plain password 'lsd': lsd, 'lgnrnd': lgnrnd, 'next': 'http://facebook.com/home.php', @@ -478,105 +508,276 @@ def _perform_login(self, username, password): self.report_warning(f'unable to log in: {err}') return - def _extract_from_url(self, url, video_id): + def _real_extract(self, url): + video_id = self._match_id(url) + url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url webpage = self._download_webpage( - url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) - - def extract_metadata(webpage): - post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] - post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( - k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) - title = get_first(media, ('title', 'text')) - description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - page_title = title or self._html_search_regex(( - r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', - r'(?s)(?P.*?)', - self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'(?P<content>.+?)', - ), webpage, 'title', default=None, group='content') - description = description or self._html_search_meta( - ['description', 'og:description', 'twitter:description'], - webpage, 'description', default=None) - uploader_data = ( - get_first(media, ('owner', {dict})) - or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) - or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) - or get_first(post, ('node', 'actors', ..., {dict})) - or get_first(post, ('event', 'event_creator', {dict})) - or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {}) - uploader = uploader_data.get('name') or ( - clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) - or self._search_regex( - (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) - timestamp = int_or_none(self._search_regex( - r']+data-utime=["\'](\d+)', webpage, - 'timestamp', default=None)) - thumbnail = self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) - # some webpages contain unretrievable thumbnail urls - # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1 - # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ - if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): - thumbnail = None - info_dict = { - 'description': description, - 'uploader': uploader, - 'uploader_id': uploader_data.get('id'), - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'view_count': parse_count(self._search_regex( - (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'), - webpage, 'view count', default=None)), - 'concurrent_view_count': get_first(post, ( - ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), - **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', { - 'like_count': ('likers', 'count', {int}), - 'comment_count': ('total_comment_count', {int}), - 'repost_count': ('share_count_reduced', {parse_count}), - }), get_all=False), - } - - info_json_ld = self._search_json_ld(webpage, video_id, default={}) - info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '') - or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}') - return merge_dicts(info_json_ld, info_dict) - - video_data = None - - def extract_video_data(instances): - video_data = [] - for item in instances: - if try_get(item, lambda x: x[1][0]) == 'VideoConfig': - video_item = item[2][0] - if video_item.get('video_id'): - video_data.append(video_item['videoData']) - return video_data - - server_js_data = self._parse_json(self._search_regex( - [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], - webpage, 'server js data', default='{}'), video_id, fatal=False) - - if server_js_data: - video_data = extract_video_data(server_js_data.get('instances', [])) - - def extract_from_jsmods_instances(js_data): - if js_data: - return extract_video_data(try_get( - js_data, lambda x: x['jsmods']['instances'], list) or []) + re.sub(r'://(?:[\w-]+\.)?facebook\.com/', '://www.facebook.com/', url), video_id) + + post_data = re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage) + sjs_data = [self._parse_json(j, video_id, fatal=False) for j in post_data] + cookies = self._get_cookies(url) + # user passed logged-in cookies or attempted to login + login_data = cookies.get('c_user') and cookies.get('xs') + logged_in = False + if login_data: + logged_in = get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'define', + lambda _, v: 'CurrentUserInitialData' in v, ..., 'ACCOUNT_ID'), default='0') != '0' + if logged_in and (info := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data', + (('ufac_client', 'state', (('set_contact_point_state_renderer', 'title'), + ('intro_state_renderer', 'header_title'))), + ('epsilon_checkpoint', 'screen', 'title')), + ))): + if any(content in info for content in ['days left to appeal', 'suspended your account']): + raise ExtractorError('Your account is suspended', expected=True) + if 'Enter mobile number' == info: + raise ExtractorError('Facebook is requiring mobile number confirmation', expected=True) + if 'your account has been locked' in info: + raise ExtractorError('Your account has been locked', expected=True) + + if props := get_first(sjs_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., (None, (..., ...)), 'rootView', + lambda _, v: v.get('title') is not None)): + if not self._cookies_passed: + self.raise_login_required(method='cookies') + else: + msg = re.sub(r'\s{2,}', ' ', join_nonempty('title', 'body', delim='. ', from_dict=props)) + raise ExtractorError(f'This video is not available. Facebook said: {msg}', expected=True) + + if post_data and not re.search(r'"[^"]+[^(feed)]_story[^"]*":', ','.join(post_data)): + raise ExtractorError('An unknown error occurred. Please try again.', expected=True) + + def find_json_obj(json_strings, *patterns, obj_in_value=False, get_all=False): + """ + Find JSON object, in the form of a string, by regular expression + >>> obj = find_json_obj([json_a, _and_b], regex_a, (regex_b, _or_c), obj_in_value=False, get_all=True) + @param json_strings string/list JSON string/a list of JSON strings (match all) + *patterns string/tuple regex patterns (if tuple, return only the 1st matched pattern) + obj_in_value boolean False: find the object(s) containing (one of) the pattern(s) + True : given pattern(s) of the key(s) to find the + object(s) in the value of that key(s) + get_all boolean return the 1st or all of the results of each regex pattern + @return list of tuple a list of (matching pattern, matched JSON object) + """ + def find_offset(string, bracket, quotation): + _BRACKET_MAP = { + '{': ([f'{{{quotation}'], ['},', '}]', '}}', f'}}{quotation}'], (1 if obj_in_value else -1)), + '}': (['},', '}]', '}}', f'}}{quotation}'], [f'{{{quotation}'], 1), + } # ([search pattern], [opposite sign], search direction); search direction: 1 - forward, -1 - backward + string = re.sub(rf'{{\\{quotation}([^{quotation}]+\\{quotation}:)', rf'{{{quotation}\1 ', string.replace('{}', '[]')) + count, b_sum, offset = 0, 0, 0 + for y, x in zip(((string[1:] + ' ') if _BRACKET_MAP[bracket][2] > 0 else (' ' + string[:-1]))[::_BRACKET_MAP[bracket][2]], + string[::_BRACKET_MAP[bracket][2]]): + s = (x + y) if _BRACKET_MAP[bracket][2] > 0 else (y + x) + count += (1 if s in _BRACKET_MAP[bracket][0] or s in _BRACKET_MAP[bracket][1] else 0) + b_sum += (1 if s in _BRACKET_MAP[bracket][0] else (-1 if s in _BRACKET_MAP[bracket][1] else 0)) + offset += 1 + if count > 0 and b_sum >= (0 if obj_in_value else 1): + break + return offset * _BRACKET_MAP[bracket][2] + + for json_str in variadic(json_strings): # loop all + if isinstance(json_str, str): + # check if json_str is a JSON string and get the quotation mark (either " or ') + if quotation := self._search_regex(r'(["\']):\s*[\[{]*\1', json_str, 'quotation', default=None): + for patterns_item in patterns: + for pattern in variadic(patterns_item): + # 'patterns_item' loop - loop each item in *patterns (item can be a str or tuple) + found = False + if isinstance(pattern, str): + for m in re.finditer(pattern, json_str): # break according to get_all + if obj_in_value: + i = (lambda x, y: (m.start(m.lastindex or 0) + x - 1) if x > 0 + else ((m.end(m.lastindex or 0) + len(y.group(0)) - 1) if y else None) + )(m.group(m.lastindex or 0).rfind('{'), + re.match(r'^\w*(?:":)?:?\s*{', json_str[m.end(m.lastindex or 0):])) + else: + i = m.start(m.lastindex or 0) + if i: + opening = (i + find_offset( + json_str[(i * obj_in_value):(i * (not obj_in_value) - obj_in_value * 2 + 1)], + '{', quotation, + ) - obj_in_value) + closing = i + find_offset(json_str[i:], '}', quotation) + if isinstance(opening, int) and isinstance(closing, int): + found = True + yield (m.group(0), json_str[opening:closing]) + if not get_all: + break + else: # if this for loop ends with break (i.e. not get_all), else clause is not executed + if found: + break # break 'patterns_item' loop if found and get_all + continue # move on to the next 'pattern' (if exists) in 'patterns_item' if not found + break # break 'patterns_item' loop if found and not get_all + if found and isinstance(patterns_item, tuple): + break # break 'patterns_item' loop if found and patterns_item is a tuple + + def extract_metadata(field=None): + if webpage_info.get(field) is not None: + return webpage_info[field] + elif field is None and webpage_info.get('timestamp') is not None: + return webpage_info + # extract data + description = title = uploader_info = None + # uploader + if field == 'uploader': + for x in find_json_obj(post_data, (rf'actors{Q}:[^}}]+{Q}__isActor{Q}:', + rf'owner{Q}:[^}}]+{Q}name{Q}:\s*{Q}[^{Q}]'), + get_all=True): + if re.search(rf'id{Q}:\s*{Q}(?:{s_id}|{video_id}){Q}', x[1]): + uploader_info = traverse_obj(json.loads(x[1]), { + 'uploader': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + 'name', {str}), + 'uploader_id': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + 'id', {str}), + 'uploader_url': ((('actors', ...), ('owner', 'owner_as_page'), 'video_owner'), + ('url', 'profile_url'), {url_or_none}), + }, get_all=False) + break + if uploader_info: + webpage_info.update(uploader_info) + # title / description + if field in ('title', 'description', None): + for x in find_json_obj(post_data, + rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:', + get_all=True): + x_dict = json.loads(x[1]) + for i in [i for i in [s_id, p_id] if i is not None]: + if x_dict.get('id') == i: + if (description := x_dict['message'] if isinstance(x_dict['message'], str) + else traverse_obj(x_dict, ('message', 'text', {str_or_none}))): + if track_title := self._search_regex(rf'({Q}track_title{Q}:\s*{Q}(?:(?:[^{Q}\\]|\\.)*){Q})', + x[1], 'track title', default=None): + description += '. ' + json.loads('{' + track_title + '}')['track_title'] + break + if description: + break + description = description or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default='') + for x in find_json_obj(post_data, rf'title{Q}:\s*[^}}]+{Q}text{Q}:\s*{Q}[^{Q}]', get_all=True): + x_dict = json.loads(x[1]) + if p_id: + if (text := traverse_obj(x_dict, ('title', 'text', {str_or_none}))): + title = title or (text if x_dict.get('id') == p_id else None) + description = description or (text if x_dict.get('id') == s_id else description) + if title and description: + break + title = (lambda x: x if x != extract_metadata('uploader') else None + )(title + or (self._html_search_regex( + (r'\s(?P<content>[\s\S]+?)\s', + r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', + r'(?s)(?P.*?)'), + re.sub(r'(Facebook(\sLive)?)|(Video)', '', webpage), + 'title', default='', group='content') + or (lambda x: '' if not x or x.group(1) in ('Video', 'Facebook', 'Facebook Live') + else x.group(1).encode().decode('unicode_escape') + )(re.search(rf'{Q}meta{Q}:\s*{{{Q}title{Q}:\s*{Q}((?:[^{Q}\\]|\\.)*){Q}', webpage)) + or og_title + ).split(' | ')[0] + or re.sub(r'(\s*\n\s*)', ' ', description)) + webpage_info['title'] = title if len(title or '') <= 100 else title[:(47 + title[47:67].rfind(' '))] + '...' + webpage_info['description'] = description + # timestamp + if field in ('timestamp', None): + for x in find_json_obj(post_data, rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,', + rf'publish_time{Q}:\s*\d+,', get_all=True): + if re.search(rf'id{Q}:\s*{Q}(?:(?:{s_id})|(?:{p_id})){Q}', x[1]): + webpage_info['timestamp'] = json.loads(x[1]).get(re.split(f'{Q}', x[0])[0]) + break + # like count + if field in ('like_count', None): + like_count = (re.search(rf'localized_name{Q}:\s*{Q}Like{Q}.[^}}]+{Q}reaction_count{Q}:\s*(\d+)}}', feedback_data) + or re.search(rf'likers{Q}:\s*{{{Q}count{Q}:\s*(\d+)}}', feedback_data)) + webpage_info['like_count'] = int(like_count.group(1)) if like_count else None + # comment count + if field in ('comment_count', None): + for x in find_json_obj(feedback_data, (rf'comments{Q}:\s*[^}}]+(total_count{Q}:\s*\d+)', + rf'total_comment_count{Q}:\s*\d+')): + webpage_info['comment_count'] = parse_count(traverse_obj(json.loads(x[1]), + (('total_count', (..., 'ig_comment_count'), 'total_comment_count'), {str_or_none}), get_all=False)) + # share count + if field in ('repost_count', None): + for x in find_json_obj(feedback_data, (rf'share_count{Q}:\s*[^}}]+(count{Q}:\s*"?\d+)', + rf'share_count(?:_reduced){Q}:\s*"?\d+')): + webpage_info['repost_count'] = parse_count(traverse_obj(json.loads(x[1]), + (('count', 'share_count_reduced', 'total_count'), {str_or_none}), get_all=False)) + # return data + return webpage_info.get(field) if field else webpage_info + + og_title = self._og_search_title(webpage, default='').split(' | ') + if len(og_title) > 1 and re.search(r'\d+\w?\s(?:reactions|shares|views)', og_title[0]): + og_title.pop(0) + og_title = re.sub(r'(\s*\n\s*)', ' ', ' | '.join(og_title)) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) + if thumbnail and not re.search(r'\.(?:gif|jpg|png|webp)', thumbnail): + thumbnail = None + + webpage_info = { + 'thumbnails': [{k: v for k, v in { + 'url': thumbnail, + 'height': int_or_none(self._search_regex( + r'stp=.+_[a-z]\d+x(\d+)&', thumbnail, 'thumbnail height', default=None)), + 'preference': None if 'stp=' in thumbnail else 1, + }.items() if v is not None}] if url_or_none(thumbnail) else [], + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'), + webpage, 'view count', default=None)), + } - def extract_dash_manifest(vid_data, formats, mpd_url=None): - dash_manifest = traverse_obj( - vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str) - if dash_manifest: - formats.extend(self._parse_mpd_formats( + p_id, s_id, linked_url, data, feedback_data = None, None, None, [], '' + Q = self._search_regex(r'(["\']):\s*[\[{]*\1', (post_data[0] if post_data else ''), 'quotation', default='"') + for p_data in post_data[:]: + if rf'{Q}feed_unit{Q}:' in p_data or not re.search( + rf'{Q}(?:dash_manifest_urls?|message|event_description){Q}:', p_data): + # discard useless feed data + post_data.remove(p_data) + else: + if (not s_id or not p_id) and (f'{Q}story{Q}:' in p_data or f'{Q}creation_story{Q}:' in p_data): + p_id = p_id or self._search_regex(rf'{Q}(?:post_id|videoId|video_id){Q}:\s*{Q}(\d+){Q}', p_data, + 'post id', default=(video_id if video_id.isnumeric() else None)) + s_id = s_id or self._search_regex(rf'id{Q}:\s*{Q}(Uzpf[^{Q}]+){Q}', p_data, 'story id', default=None) + if not data: + if re.search(rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:', p_data): + # linked video + for x in find_json_obj(p_data, rf'{Q}attachment{Q}:\s*{{{Q}(?:source|web_link){Q}:', + obj_in_value=True): + if linked_url := traverse_obj( + json.loads(x[1]), (('web_link', None), 'url', {url_or_none}), get_all=False): + url_transparent = '.facebook.com' not in urllib.parse.urlparse(linked_url).netloc + data = x[1] + break + elif f'{Q}dash_manifest_url' in p_data[:p_data.find(f'{Q}comment_list_renderer{Q}:')]: + for x in find_json_obj(p_data, rf'{Q}data{Q}:\s*{{', rf'{Q}data{Q}:', obj_in_value=True): + if f'{Q}dash_manifest_url' in x[1]: + data = x[1] + break + if (not feedback_data + and (f'{Q}likers{Q}:' in p_data or f'{Q}Like{Q}}},' in p_data or f'comment_count{Q}:' in p_data)): + for x in find_json_obj(p_data, rf'reaction_count{Q}:\s*{{', rf'feedback{Q}:\s*{{'): + if (((s_id or p_id or video_id) in x[1] or (p_id or video_id) in x[1]) + and webpage.count(json.loads(x[1]).get('id', 'null')) > 1): + feedback_data = x[1] + break + + if linked_url: + return self.url_result(linked_url, video_id=video_id, url_transparent=url_transparent, + **{k: v for k, v in (extract_metadata() if url_transparent else {}).items() if v}) + + def extract_dash_manifest(vid_data, formats, subtitle, mpd_url=None): + if dash_manifest := traverse_obj(vid_data, 'dash_manifest_xml_string', 'manifest_xml', + 'playlist', 'dash_manifest', expected_type=str): + fmts, subs = self._parse_mpd_formats_and_subtitles( compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), - mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url)) + mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitle[0]) def process_formats(info): - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. for f in info['formats']: # Downloads with browser's User-Agent are rate limited. Working around # with non-browser User-Agent. @@ -584,77 +785,45 @@ def process_formats(info): # Formats larger than ~500MB will return error 403 unless chunk size is regulated f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 - def yield_all_relay_data(_filter): - for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage): - yield self._parse_json(relay_data, video_id, fatal=False) or {} - - def extract_relay_data(_filter): - return next(filter(None, yield_all_relay_data(_filter)), {}) - - def extract_relay_prefetched_data(_filter, target_keys=None): - path = 'data' - if target_keys is not None: - path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys)) - return traverse_obj(yield_all_relay_data(_filter), ( - ..., 'require', (None, (..., ..., ..., '__bbox', 'require')), - lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), - ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {} - - if not video_data: - server_js_data = self._parse_json(self._search_regex([ - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, - rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);', - ], webpage, 'js data', default='{}'), video_id, js_to_json, False) - video_data = extract_from_jsmods_instances(server_js_data) - - if not video_data: - data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)', - target_keys=('video', 'event', 'nodes', 'node', 'mediaset')) - if data: - entries = [] - - def parse_graphql_video(video): - v_id = video.get('videoId') or video.get('id') or video_id - reel_info = traverse_obj( - video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) - if reel_info: - video = video['creation_story'] - video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) - video.update(reel_info) - - formats = [] - q = qualities(['sd', 'hd']) - - # Legacy formats extraction - fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video - for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), - ('browser_native_sd_url', 'sd')): - playable_url = fmt_data.get(key) - if not playable_url: - continue + if data: + def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + formats, captions, subtitles = [], {}, {} + q = qualities(['sd', 'hd']) + is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) + + # videoDeliveryLegacy formats extraction + fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video + for key, format_id in (('browser_native_hd_url', 'hd'), ('browser_native_sd_url', 'sd')): + # obsoleted: ('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), ('playable_url_dash', '') + if playable_url := fmt_data.get(key): if determine_ext(playable_url) == 'mpd': - formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles(playable_url, video_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles)) else: + q = qualities(['sd', 'hd']) formats.append({ 'format_id': format_id, # sd, hd formats w/o resolution info should be deprioritized below DASH 'quality': q(format_id) - 3, 'url': playable_url, }) - extract_dash_manifest(fmt_data, formats) + extract_dash_manifest(fmt_data, formats, [captions if is_broadcast else subtitles]) - # New videoDeliveryResponse formats extraction - fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult')) + # videoDeliveryResponse formats extraction + if fmt_data := traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult')): mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none})) dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml'])) for idx, dash_manifest in enumerate(dash_manifests): - extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx)) + extract_dash_manifest(dash_manifest, formats, [captions if is_broadcast else subtitles], + mpd_url=traverse_obj(mpd_urls, idx)) if not dash_manifests: # Only extract from MPD URLs if the manifests are not already provided for mpd_url in mpd_urls: - formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles)) for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])): format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower})) formats.append({ @@ -664,157 +833,174 @@ def parse_graphql_video(video): 'url': prog_fmt['progressive_url'], }) for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})): - formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls')) - - if not formats: - # Do not append false positive entry w/o any formats - return - - automatic_captions, subtitles = {}, {} - is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) - for caption in traverse_obj(video, ( - 'video_available_captions_locales', - {lambda x: sorted(x, key=lambda c: c['locale'])}, - lambda _, v: url_or_none(v['captions_url']), - )): - lang = caption.get('localized_language') or 'und' - subs = { - 'url': caption['captions_url'], - 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), - } - if caption.get('localized_creation_method') or is_broadcast: - automatic_captions.setdefault(caption['locale'], []).append(subs) - else: - subtitles.setdefault(caption['locale'], []).append(subs) - captions_url = traverse_obj(video, ('captions_url', {url_or_none})) - if captions_url and not automatic_captions and not subtitles: - locale = self._html_search_meta( - ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') - (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] - - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': traverse_obj( - video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), - 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), - 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) - or float_or_none(video.get('length_in_second'))), - 'automatic_captions': automatic_captions, - 'subtitles': subtitles, + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', + fatal=False, m3u8_id='hls') + formats.extend(fmts) + self._merge_subtitles(subs, target=(captions if is_broadcast else subtitles)) + + # captions/subtitles + for caption in traverse_obj(video, ( + 'video_available_captions_locales', + {lambda x: sorted(x, key=lambda c: c['locale'])}, + lambda _, v: url_or_none(v['captions_url']), + )): + lang = caption.get('localized_language') or 'und' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), } - process_formats(info) - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 'description': description, - }) + if caption.get('localized_creation_method') or is_broadcast: + captions.setdefault(caption['locale'], []).append(subs) else: - info['title'] = description or f'Facebook video #{v_id}' - entries.append(info) - - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) - - nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) - attachments = traverse_obj(nodes, ( - ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', - ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), - 'attachment', {dict})) - for attachment in attachments: - ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), - ('target', 'attachments', ..., 'styles', 'attachment', {dict})) - for n in ns: - parse_attachment(n) - parse_attachment(attachment) - - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') - - video = traverse_obj(data, ( - 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'], - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - if len(entries) > 1: - return self.playlist_result(entries, video_id) - - video_info = entries[0] if entries else {'id': video_id} - webpage_info = extract_metadata(webpage) - # honor precise duration in video info - if video_info.get('duration'): - webpage_info['duration'] = video_info['duration'] - # preserve preferred_thumbnail in video info - if video_info.get('thumbnail'): - webpage_info['thumbnail'] = video_info['thumbnail'] - return merge_dicts(webpage_info, video_info) - - if not video_data: - m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) - if m_msg is not None: - raise ExtractorError( - f'The video is not available, Facebook said: "{m_msg.group(1)}"', - expected=True) - elif any(p in webpage for p in ( - '>You must log in to continue', - 'id="login_form"', - 'id="loginbutton"')): - self.raise_login_required() - - if not video_data and '/watchparty/' in url: - post_data = { - 'doc_id': 3731964053542869, - 'variables': json.dumps({ - 'livingRoomID': video_id, - }), - } - - prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') - if prefetched_data: - lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) - if lsd: - post_data[lsd['name']] = lsd['value'] - - relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') - for define in (relay_data.get('define') or []): - if define[0] == 'RelayAPIConfigDefaults': - self._api_config = define[2] + subtitles.setdefault(caption['locale'], []).append(subs) + captions_url = traverse_obj(video, ('captions_url', {url_or_none})) + if captions_url and not captions and not subtitles: + locale = self._html_search_meta( + ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + (captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] + # thumbnails + thumbnails = [] + for url in [uri for uri in [traverse_obj(video, path) for path in [ + ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri'), + ('image', 'uri'), ('previewImage', 'uri'), + ]] if url_or_none(uri) is not None]: + if (re.search(r'\.(?:jpg|png)', url) + and not any(url.split('_cat=')[0] in t['url'] for t in thumbnails)): + thumbnails.append({k: v for k, v in { + 'url': url, + 'height': int_or_none(self._search_regex( + r'stp=.+_[a-z]\d+x(\d+)&', url, 'thumbnail height', default=None)), + 'preference': None if 'stp=' in url else 1, + }.items() if v is not None}) + # timestamp + v_timestamp = traverse_obj(video, 'publish_time', 'creation_time', 'created_time', {int_or_none}) + if not v_timestamp and v_id != video_id: + for x in find_json_obj(post_data, rf'creation_time{Q}:\s*\d+,', rf'created_time{Q}:\s*\d+,', + rf'publish_time{Q}:\s*\d+,', get_all=True): + if re.search(rf'id{Q}:\s*{Q}{v_id}{Q}', x[1]): + if v_timestamp := json.loads(x[1]).get(x[0].split(f'{Q}')[0]): + break + # uploader + if uploader_id := traverse_obj(video, ('owner', 'id', {str_or_none})): + if x := list(find_json_obj(data, (rf'id{Q}:\s*{Q}{uploader_id}{Q}[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]', + rf'{Q}name{Q}:\s*{Q}[^{Q}][^}}]*{Q}id{Q}:\s*{Q}{uploader_id}{Q}'))): + if x[0][1]: + video['owner'] = merge_dicts(video['owner'], json.loads(x[0][1])) + elif x := list(find_json_obj(post_data, (rf'(owner{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]'), + rf'((?!share)\w{{5}}_creator{Q}:)[^}}]*{Q}name{Q}:\s*{Q}[^{Q}]', + obj_in_value=True)): + if x[0][1]: + video['owner'] = json.loads(x[0][1]) + uploader_id = traverse_obj(video, ('owner', 'id', {str_or_none})) + uploader = traverse_obj(video, ('owner', 'name', {str_or_none})) or extract_metadata('uploader') + # description + v_desc = traverse_obj(video, ('savable_description', 'text', {str_or_none})) + if not v_desc and v_id != video_id: + if vs_id := traverse_obj(video, ( + (None, (..., 'video')), 'creation_story', 'id', {str_or_none}), get_all=False): + if x := list(find_json_obj( + data, rf'{Q}message{Q}:(?:(?!{Q}message{Q}:)[^}}])+{Q}text{Q}:\s*{Q}[^{Q}](?:(?!{Q}id{Q}:).)+{Q}id{Q}:\s*{Q}{vs_id}{Q}')): + v_desc = (lambda x: x if x != uploader else None)(json.loads(x[0][1])['message']['text']) + # else: + # for x in find_json_obj(data, rf'video{Q}:\s*{{{Q}id{Q}:\s*{Q}{v_id}{Q}', get_all=True): + # if v_desc := traverse_obj(json.loads(x[1]), ('message', 'text', {str_or_none})): + # break + # title + if v_name := video.get('name'): + v_title = v_name if len(v_name) <= 100 else v_name[:(47 + v_name[47:67].rfind(' '))] + '...' + + info = { + 'id': v_id, + 'title': ((v_title if video.get('name') else None) + or (f"{extract_metadata('title')} Facebook Video #{v_id}" if extract_metadata('title') + else (f'{uploader} Facebook Video #{v_id}' if uploader else f'Facebook Video #{v_id}'))), + 'description': v_desc or extract_metadata('description'), + 'thumbnails': thumbnails, + 'timestamp': v_timestamp or extract_metadata('timestamp'), + 'uploader': uploader, + 'uploader_id': uploader_id or webpage_info.get('uploader_id'), + 'uploader_url': (traverse_obj(video, ('owner', 'url', {url_or_none})) + or (webpage_info.get('uploader_url') if webpage_info.get('uploader') == uploader else None) + or (lambda x: f'https://www.facebook.com/profile.php?id={x}' if x else None + )(uploader_id or webpage_info.get('uploader_id'))), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), + 'formats': formats, + 'automatic_captions': captions, + 'subtitles': subtitles, + 'is_live': video.get('is_live_streaming'), + 'was_live': (video.get('broadcast_status') == 'VOD_READY'), + 'concurrent_view_count': video.get('liveViewerCount'), + 'like_count': extract_metadata('like_count'), + 'comment_count': extract_metadata('comment_count'), + 'repost_count': extract_metadata('repost_count'), + 'age_limit': 18 if (re.search(rf'{Q}validator{Q}:\s*{Q}GRAPHIC{Q}', data) + and f'{Q}OverlayWarningScreenViewModel{Q}' in data) else None, + } + process_formats(info) + entries.append(info) + + entries, video_ids = [], [] + # for idx, x in enumerate(find_json_obj(data, (rf'dash_manifest_url{Q}:\s*{Q}', + # rf'_hd_url{Q}:\s*{Q}', rf'_sd_url{Q}:\s*{Q}'), get_all=True)): + for idx, x in enumerate(find_json_obj(data, (rf'videoDeliveryLegacyFields{Q}:'), get_all=True)): + media = json.loads(x[1]) + if (media.get('__typename', 'Video') == 'Video' + and not media.get('sticker_image') + and media.get('id', f'{video_id}_{idx}') not in video_ids): + video_ids.append(media.get('id', f'{video_id}_{idx}')) + parse_graphql_video(media) + if media.get('id') == video_id: + break + + if len(entries) > 1: + return self.playlist_result(entries, video_id, **{ + k: v for k, v in extract_metadata().items() if v}) + + video_info = entries[0] if entries else {'id': video_id} + video_info['title'] = re.sub(r' Facebook Video #\d{15,}$', '', video_info.get('title')) + if webpage_info['thumbnails']: + if not (any(webpage_info['thumbnails'][0]['url'].split('_cat=')[0] in thumbnail['url'] + for thumbnail in video_info['thumbnails'])): + video_info['thumbnails'].extend(webpage_info['thumbnails']) + return merge_dicts(video_info, webpage_info) + + # if 'data' not found + video_data = None - living_room = self._download_json( - urljoin(url, self._api_config['graphURI']), video_id, - data=urlencode_postdata(post_data))['data']['living_room'] + def extract_video_data(instances): + video_data = [] + for item in instances: + if try_get(item, lambda x: x[1][0]) == 'VideoConfig': + video_item = item[2][0] + if video_item.get('video_id'): + video_data.append(video_item['videoData']) + return video_data - entries = [] - for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): - video = try_get(edge, lambda x: x['node']['video']) or {} - v_id = video.get('id') - if not v_id: - continue - v_id = str(v_id) - entries.append(self.url_result( - self._VIDEO_PAGE_TEMPLATE % v_id, - self.ie_key(), v_id, video.get('name'))) + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) - return self.playlist_result(entries, video_id) + if server_js_data := self._parse_json(self._search_regex( + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False): + video_data = extract_video_data(server_js_data.get('instances', [])) if not video_data: + if server_js_data := self._parse_json(self._search_regex([ + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, + rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);', + ], webpage, 'js data', default='{}'), video_id, js_to_json, False): + video_data = extract_from_jsmods_instances(server_js_data) + + if not video_data and False: # skipped because not working # Video info not in first request, do a secondary request using # tahoe player specific URL tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + self._VIDEO_PAGE_TAHOE_TEMPLATE % p_id, video_id, + fatal=False, + expected_status=404, data=urlencode_postdata({ '__a': 1, '__pc': self._search_regex( @@ -830,15 +1016,30 @@ def parse_attachment(attachment, key='media'): headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) - tahoe_js_data = self._parse_json( - self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, - 'tahoe js data', default='{}'), - video_id, fatal=False) - video_data = extract_from_jsmods_instances(tahoe_js_data) + if tahoe_data: + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data: - raise ExtractorError('Cannot parse data') + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) + if m_msg is not None: + raise ExtractorError( + f'The video is not available, Facebook said: "{m_msg.group(1)}"', + expected=True) + elif any(p in webpage for p in ( + '>You must log in to continue', + 'id="login_form"', + 'id="loginbutton"')): + self.raise_login_required(method='cookies') + elif not login_data: + self.raise_login_required('No video formats found', method='cookies') + elif not logged_in: + self.raise_login_required('Failed to login with provided data', method='cookies') + self.raise_no_formats('No video formats found!') if len(video_data) > 1: entries = [] @@ -849,6 +1050,7 @@ def parse_attachment(attachment, key='media'): entries.append(self.url_result(urljoin( url, video_url), self.ie_key(), v[0].get('video_id'))) return self.playlist_result(entries, video_id) + video_data = video_data[0] formats = [] @@ -874,7 +1076,7 @@ def parse_attachment(attachment, key='media'): 'quality': preference, 'height': 720 if quality == 'hd' else None, }) - extract_dash_manifest(f[0], formats) + extract_dash_manifest(f[0], formats, [subtitles]) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) @@ -885,31 +1087,33 @@ def parse_attachment(attachment, key='media'): 'subtitles': subtitles, } process_formats(info_dict) - info_dict.update(extract_metadata(webpage)) + info_dict.update(webpage_info) return info_dict - def _real_extract(self, url): - video_id = self._match_id(url) - - real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - return self._extract_from_url(real_url, video_id) - class FacebookPluginsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' _TESTS = [{ 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', - 'md5': '5954e92cdfe51fe5782ae9bda7058a07', 'info_dict': { 'id': '10154383743583686', 'ext': 'mp4', - # TODO: Fix title, uploader 'title': 'What to do during the haze?', - 'uploader': 'Gov.sg', + 'description': 'md5:81839c0979803a014b20798df255ed0b', + 'thumbnail': r're:^https?://.*', + 'uploader': 'gov.sg', + 'uploader_id': '100064718678925', + 'uploader_url': r're:^https?://.*', 'upload_date': '20160826', 'timestamp': 1472184808, + 'duration': 65.087, + 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'not_live', + 'like_count': int, + 'comment_count': int, }, 'add_ie': [FacebookIE.ie_key()], }, { @@ -940,7 +1144,7 @@ class FacebookRedirectURLIE(InfoExtractor): 'playable_in_embed': True, 'categories': ['Music'], 'channel': 'Boiler Room', - 'uploader_id': 'brtvofficial', + 'uploader_id': '@boilerroom', 'uploader': 'Boiler Room', 'tags': 'count:11', 'duration': 3332, @@ -948,11 +1152,16 @@ class FacebookRedirectURLIE(InfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', 'availability': 'public', - 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'uploader_url': r're:^https?://.*', 'upload_date': '20150917', + 'timestamp': 1442489450, 'age_limit': 0, 'view_count': int, 'like_count': int, + 'heatmap': 'count:100', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, @@ -971,18 +1180,19 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'a53256d10fc2105441fe0c4212ed8cea', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$', - 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$', + 'title': 'md5:32aab9976c6b8a145fc0d799631e2b74', + 'description': 'md5:3ea795c5ebb7ed28e3e78bb7b1191753', 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', + 'uploader_url': r're:^https?://.*', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'live_status': 'not_live', 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -992,7 +1202,7 @@ class FacebookReelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( - f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) + f'https://www.facebook.com/watch/?v={video_id}', FacebookIE, video_id) class FacebookAdsIE(InfoExtractor): @@ -1076,11 +1286,15 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - post_data = traverse_obj( - re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) - data = get_first(post_data, ( - 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., - 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) + if post_data := traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})): + data = get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) + elif post_data := traverse_obj( + re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage), (..., {json.loads})): + data = get_first(post_data, ( + 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict})) if not data: raise ExtractorError('Unable to extract ad data')