# wacz_enricher.py
import jsonlines
import mimetypes
import os, shutil, subprocess
from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from ..core import Media, Metadata, ArchivingContext
from . import Enricher
from ..archivers import Archiver
from ..utils import UrlUtil, random_str
import time
class WaczArchiverEnricher(Enricher, Archiver):
"""
    Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
    If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
it can become quite powerful for archiving private content.
When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
"""
name = "wacz_archiver_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
@staticmethod
def configs() -> dict:
return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
# "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"timeout": {"default": 200, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
}
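    # A rough, hypothetical sketch (not taken from this repo) of how these options might be
    # wired up in an auto-archiver orchestration.yaml; the keys mirror configs() above, but
    # the exact file layout can differ between auto-archiver versions:
    #
    #   steps:
    #     enrichers: [wacz_archiver_enricher]
    #   configurations:
    #     wacz_archiver_enricher:
    #       profile: secrets/profile.tar.gz
    #       docker_commands: null
    #       timeout: 200
    #       extract_media: true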
    # DM: setup and cleanup are new functions
    # which I'm commenting out for now to make sure everything still works
def setup(self) -> None:
#self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
#self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
#self.cwd_dind = f"/crawls/crawls{random_str(8)}"
#self.browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST')
#self.browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or self.browsertrix_home_host
# create crawls folder if not exists, so it can be safely removed in cleanup
#if self.docker_in_docker:
# os.makedirs(self.cwd_dind, exist_ok=True)
        pass  # placeholder while the body above is commented out
def cleanup(self) -> None:
#if self.docker_in_docker:
# logger.debug(f"Removing {self.cwd_dind=}")
# shutil.rmtree(self.cwd_dind, ignore_errors=True)
# cleanup the linux tmp directory
        linux_tmp_dir = '/home/dave/aatmp'
if os.path.exists(linux_tmp_dir):
shutil.rmtree(linux_tmp_dir)
def download(self, item: Metadata) -> Metadata:
# this new Metadata object is required to avoid duplication
result = Metadata()
result.merge(item)
if self.enrich(result):
return result.success("wacz")
    # On WSL2 in dev I've seen spurious ERR_NETWORK_CHANGED errors
    # from browsertrix which fail the crawl.
    # It seems to be more solid on Linux in production.
def enrich(self, to_enrich: Metadata) -> bool:
url = to_enrich.get_url()
if "facebook.com" in to_enrich.netloc:
logger.debug("Special codepath using playwright with a logged in facebook profile to do a screenshot")
# where 1.png etc are saved
tmp_dir = ArchivingContext.get_tmp_dir()
command = ["pipenv", "run", "xvfb-run", "python3", "c60playwright_facebook.py", url, tmp_dir]
# '/mnt/c/dev/v6-auto-archiver' - where the c60.py file is called
working_directory = os.getcwd()
# Use subprocess.run to execute the command with the specified working directory
sub_result = subprocess.run(command, cwd=working_directory, capture_output=True, text=True)
# Print the output and error (if any)
logger.debug(f"Playwright Output: {sub_result.stdout}")
            fn = os.path.join(tmp_dir, "1.png")
            m = Media(filename=fn)
            to_enrich.add_media(m, "c60playwright-screenshot")
if to_enrich.get_media_by_id("browsertrix"):
logger.info(f"WACZ enricher had already been executed: {to_enrich.get_media_by_id('browsertrix')}")
return True
collection = random_str(8)
# need a temp directory for docker to write to outside of current working directory for this app
# otherwise working_directory = os.getcwd() will fail after docker has run once.
        linux_tmp_dir = '/home/dave/aatmp'
# Check if the directory exists, and if not, create it
if not os.path.exists(linux_tmp_dir):
os.makedirs(linux_tmp_dir)
# This doesn't work
# '/mnt/c/dev/v6-auto-archiver/tmpa22nvh69'
# browsertrix_home_host = os.path.abspath(ArchivingContext.get_tmp_dir())
# try:
# browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
# except FileNotFoundError as e:
# logger.debug('Dev environment found using ' + hard_code_directory_for_wsl2)
# browsertrix_home_host = hard_code_directory_for_wsl2 + ArchivingContext.get_tmp_dir()[1:]
# works
# browsertrix_home_host ='/mnt/c/dev/test'
browsertrix_home_host = linux_tmp_dir
# browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
browsertrix_home_container = linux_tmp_dir
cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout),
"--postLoadDelay", "20"]
# call docker if explicitly enabled or we are running on the host (not in docker)
# use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
use_docker = True
#88 - generating WACZ in Docker for url='https://www.facebook.com/khitthitnews/posts/pfbid02tX6o4TcNykMYyH4Wjbz3ckq5bH5rRr7aqLFCymkWwhVzPJGwq2mSCnp9jYZ8CVdTl'
# 89 - browsertrix_home_host='/home/dave/auto-archiver/tmplwb1vufr' browsertrix_home_container='/home/dave/auto-archiver/tmplwb1vufr'
# 99 - copying secrets/profile.tar.gz to /home/dave/auto-archiver/tmplwb1vufr/profile.tar.gz
if use_docker:
logger.debug(f"generating WACZ in Docker for {url=}")
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
if self.docker_commands:
cmd = self.docker_commands + cmd
else:
# note there is another part further down the code which needs to be changed too.
cmd = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd
if self.profile:
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
logger.debug(f"copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
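                # For reference, with use_docker=True, no custom docker_commands, the default
                # timeout of 200 and a profile configured, the assembled command ends up roughly
                # (collection is a random 8-char id):
                #   docker run --rm -v /home/dave/aatmp:/crawls/ webrecorder/browsertrix-crawler \
                #     crawl --url <url> --scopeType page --generateWACZ --text --screenshot fullPage \
                #     --collection <collection> --id <collection> --saveState never \
                #     --behaviors autoscroll,autoplay,autofetch,siteSpecific \
                #     --behaviorTimeout 200 --timeout 200 --postLoadDelay 20 --profile /crawls/profile.tar.gz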
# else:
# logger.debug(f"generating WACZ without Docker for {url=}")
# if self.profile:
# cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return False
        # good test for the filesystem bug solved by having a temp directory outside of the current working directory:
        # without that, this call would throw an exception after docker has run once
working_directory = os.getcwd()
if use_docker:
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
# else:
# wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
if not os.path.exists(wacz_fn):
logger.warning(f"Unable to locate and upload WACZ {wacz_fn=}")
return False
# adding the .wacz file to the Metadata object
to_enrich.add_media(Media(wacz_fn), "browsertrix")
if self.extract_media:
self.extract_media_from_wacz(to_enrich, wacz_fn)
return True
def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:
"""
Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
"""
logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)
        # DM - use --combineWarc so we don't have to do this?
# if warc is split into multiple gzip chunks, merge those
warc_dir = os.path.join(unzipped_dir, "archive")
warc_filename = os.path.join(tmp_dir, "merged.warc")
with open(warc_filename, 'wb') as outfile:
for filename in sorted(os.listdir(warc_dir)):
if filename.endswith('.gz'):
chunk_file = os.path.join(warc_dir, filename)
with open(chunk_file, 'rb') as infile:
shutil.copyfileobj(infile, outfile)
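        # Note: simple byte concatenation should be enough here because each chunk under
        # archive/ is a stream of per-record gzip members, and warcio's ArchiveIterator
        # auto-detects gzip even though the merged file is named ".warc".
        # Rough shell equivalent:  cat unzipped/archive/*.gz > merged.warc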
# get media out of .warc
counter = 0
seen_urls = set()
url = to_enrich.get_url()
if "facebook.com" in to_enrich.netloc:
logger.debug("special facebook codepath to extract media")
# if this first root page is strategy 0 then get the images as will be full resolution already
if "facebook.com/photo" in url:
# strategy 0 eg https://www.facebook.com/photo/?fbid=1646726009098072&set=pcb.1646726145764725
crawl_and_get_media_from_sub_page = False
else:
# strategy 1 eg https://www.facebook.com/khitthitnews/posts/pfbid0PTvT6iAccWqatvbDQNuqpFwL5WKzHuLK4QjP97Fwut637CV3XXQU53z1s2bJMAKwl
crawl_and_get_media_from_sub_page = True
with open(warc_filename, 'rb') as warc_stream:
for record in ArchiveIterator(warc_stream):
# only include fetched resources
if record.rec_type == "resource": # browsertrix screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
m = Media(filename=fn)
                        # DM there are 2 screenshots
                        # the first one is bonkers: it claims to be a png but isn't.
                        # ignore it as it is always there
                        # I've seen the 4th be bonkers too
                        # DM 3rd Oct 24 - using playwright to get a screenshot so let's comment this out
                        # was getting too much green in the screenshots
                        # DM Aug 18 2024
                        # testing screenshots
                        if counter == 0:
                            logger.debug("ignoring the first screenshot as it claims to be a png but isn't")
else:
logger.debug('Ignoring screenshots inside wacz for now')
# to_enrich.add_media(m, f"browsertrix-screenshot-{counter}")
counter += 1
                        # DM added as we want to go to the next record from here
continue
# Get fb_id and set_id logic
# DM catch for strategy 1 - Part 1
                    if record.rec_type == 'request' and crawl_and_get_media_from_sub_page:
uri = record.rec_headers.get_header('WARC-Target-URI')
if "bulk-route-definitions/" in uri:
content = record.content_stream().read()
foo = str(content)
# Strategy 1 test
# photo%2F%3Ffbid%3D1646726009098072%26set%3Dpcb.1646726145764725%26
# fbid = 1646726009098072
# set = pcb.1646726145764725
                            photo_string_start_pos = foo.find('photo%2F%3Ffbid%3D', 0)
                            if photo_string_start_pos > 0:
                                fbid_start_pos = photo_string_start_pos + 18
                                middle_26_start_pos = foo.find('%26', fbid_start_pos)
                                fb_id = foo[fbid_start_pos:middle_26_start_pos]
                                # photo%2F%3Ffbid%3D1646726009098072%26set%3Dpcb.1646726145764725%26
                                set_end_pos = foo.find('%26', middle_26_start_pos + 1)
                                set_id = foo[middle_26_start_pos + 13:set_end_pos]
logger.info(f" *** Part 1 - Strategy 1 {fb_id=} and {set_id=}")
                                next_url = f'https://www.facebook.com/photo/?fbid={fb_id}&set=pcb.{set_id}'
                                logger.debug(f'starting url to go to the next full-res image js viewer page and begin the crawl: {next_url}')
# Part 2
# fb_ids_to_request = []
fb_ids_requested = []
                                while True:
builder_url = f"https://www.facebook.com/photo?fbid={fb_id}&set=pcb.{set_id}"
fb_ids_requested.append(fb_id)
logger.info(f" *** Part 2 next trying url for js page {builder_url}")
next_fb_id = self.save_images_to_enrich_object_from_url_using_browsertrix(builder_url, to_enrich, fb_id)
total_images = len(to_enrich.media)
if total_images > 70:
logger.warning('Total images is > max so stopping crawl')
break
if next_fb_id in fb_ids_requested:
logger.debug('have looped around all photos in js viewer so end')
break
else:
fb_id = next_fb_id
else:
                                logger.debug('photo string not found in bulk-route-definitions - this is normal; have only seen roughly 1 in 3 of these records contain it')
                                logger.debug('it could also be a single image, which is handled differently')
# Strategy x - single photo in js viewer
# photos%2Fa.386800725090613%2F1425302087907133%2F%3F
                                photos_string_start_pos = foo.find('photos%2Fa.', 0)
                                if photos_string_start_pos > 0:
                                    fbid_start_pos = photos_string_start_pos + 11
                                    middle_2F_start_pos = foo.find('%2F', fbid_start_pos)
                                    fb_id = foo[fbid_start_pos:middle_2F_start_pos]
                                    set_end_pos = foo.find('%2F', middle_2F_start_pos + 1)
                                    set_id = foo[middle_2F_start_pos + 3:set_end_pos]
logger.debug(f"Strategy x single photo {fb_id=} and {set_id=}")
# route_urls[0]=%2Fkhitthitnews%3F
                                    name_thing_start_pos = foo.find('route_urls[0]=%2F', 0)
                                    name_thing_end_pos = foo.find('%3F', name_thing_start_pos)
                                    name_thing = foo[name_thing_start_pos + 17:name_thing_end_pos]
# https://www.facebook.com/khitthitnews/photos/a.386800725090613/1425302087907133/
builder_url = f'https://www.facebook.com/{name_thing}/photos/a.{fb_id}/{set_id}/'
                                    logger.debug(f'url to get next for the single photo is {builder_url}')
next_fb_id = self.save_images_to_enrich_object_from_url_using_browsertrix(builder_url, to_enrich, fb_id)
# no crawl as a single photo only which has already been added
# this is probably meant to be a single photo
                                    # however there may be many more via left and right arrows, but we don't want those
# end of strategy 1
continue # to the next record
# only strategy 0
# save image logic
# as we don't want media from other strategies from this root page
                    if not crawl_and_get_media_from_sub_page:
if record.rec_type != 'response': continue
record_url = record.rec_headers.get_header('WARC-Target-URI')
if not UrlUtil.is_relevant_url(record_url):
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
continue
if record_url in seen_urls:
logger.debug(f"Skipping already seen URL {record_url}.")
continue
# filter by media mimetypes
content_type = record.http_headers.get("Content-Type")
if not content_type: continue
if not any(x in content_type for x in ["video", "image", "audio"]): continue
# DM - ignore this specialised content type for facebook
if content_type == "image/x.fb.keyframes": continue
# create local file and add media
ext = mimetypes.guess_extension(content_type)
warc_fn = f"warc-file-{counter}{ext}"
fn = os.path.join(tmp_dir, warc_fn)
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
# fn = './tmpil4kenvz/warc-file-0.jpg'
m = Media(filename=fn)
# record_url = 'https://scontent.ffab1-1.fna.fbcdn.net/v/t39.30808-1/340998090_786100926215087_3926180936792898436_n.jpg?stp=cp0_dst-jpg_p40x40&_nc_cat=1&ccb=1-7&_nc_sid=754033&_nc_ohc=8f70FA4fXssAX_OKDt0&_nc_oc=AQmMBocWCJEOrxM00aa52d3EcGEpbsCGKYMJSZcCgtOXrSnz66eWPGLgiuZ7GU3LiqM&_nc_ht=scontent.ffab1-1.fna&oh=00_AfC-Xg0lgD-HjujjdkUYrvwtgiFbq6tvuZJyu5Mfgnk24A&oe=650874A9'
m.set("src", record_url)
# if a link with better quality exists, try to download that
if record_url_best_qual != record_url:
try:
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
m.set("src", record_url_best_qual)
m.set("src_alternative", record_url)
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
# remove bad videos
if m.is_video() and not m.is_valid_video(): continue
# DM if size of media file is <30k discard
if os.path.getsize(m.filename) < 30000: continue
logger.debug(f'Facebook strategy 0. Saving {m.filename}')
# to_enrich contains the wacz and 4 images
# warc_fn = 'warc-file-0.jpg'
to_enrich.add_media(m, warc_fn)
counter += 1
seen_urls.add(record_url)
logger.info(f"special case FB WACZ extract_media finished, found {counter} relevant media file(s)")
## normal non FB media extraction
        # this is for the archiver (if all else fails) - but I don't use this,
        # and for the enricher - where I just want the wacz
else:
with open(warc_filename, 'rb') as warc_stream:
counter = 0
for record in ArchiveIterator(warc_stream):
# only include fetched resources
if record.rec_type == "resource": # screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
m = Media(filename=fn)
# DM 4th Oct - don't want screenshot from wacz for non fb
# to_enrich.add_media(m, f"browsertrix-screenshot-{counter}")
counter += 1
if record.rec_type != 'response': continue
record_url = record.rec_headers.get_header('WARC-Target-URI')
if not UrlUtil.is_relevant_url(record_url):
logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
continue
if record_url in seen_urls:
logger.debug(f"Skipping already seen URL {record_url}.")
continue
# filter by media mimetypes
content_type = record.http_headers.get("Content-Type")
if not content_type: continue
if not any(x in content_type for x in ["video", "image", "audio"]): continue
# DM - ignore this specialised content type for facebook
# if content_type == "image/x.fb.keyframes": continue
# create local file and add media
ext = mimetypes.guess_extension(content_type)
warc_fn = f"warc-file-{counter}{ext}"
fn = os.path.join(tmp_dir, warc_fn)
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
# only want < 40 images
if counter > 40:
logger.debug(f"Stopping as found 40 images")
break # out of for loop
# only want larger more important images
# DM 25th Oct 24 if size of media file is < x discard
fs = os.path.getsize(fn)
if fs < 6000 and ext == ".jpg": continue
if fs < 6000 and ext == ".webp": continue
if fs < 37000 and ext == ".png": continue
if ext == ".gif": continue
if ext == ".ico": continue
                    if ext is None: continue
m = Media(filename=fn)
m.set("src", record_url)
# if a link with better quality exists, try to download that
if record_url_best_qual != record_url:
try:
m.filename = self.download_from_url(record_url_best_qual, warc_fn, to_enrich)
m.set("src", record_url_best_qual)
m.set("src_alternative", record_url)
except Exception as e: logger.warning(f"Unable to download best quality URL for {record_url=} got error {e}, using original in WARC.")
# remove bad videos
if m.is_video() and not m.is_valid_video(): continue
logger.debug(f'Normal strategy 0. Saving {m.filename}')
to_enrich.add_media(m, warc_fn)
counter += 1
seen_urls.add(record_url)
logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
# only used by FB codepath
def save_images_to_enrich_object_from_url_using_browsertrix(self, url_build, to_enrich: Metadata, current_fb_id):
        logger.debug('Inside Part 2')
# call browsertrix and get a warc file using a logged in facebook profile
# this will get full resolution image which we can then save as a jpg
with open('url.txt', 'w') as file:
file.write(url_build)
collection = random_str(8)
        linux_tmp_dir = '/home/dave/aatmp'
# Check if the directory exists, and if not, create it
if not os.path.exists(linux_tmp_dir):
os.makedirs(linux_tmp_dir)
# DM 31st Oct 24 take out
# hard_code_directory_for_wsl2 ='/mnt/c/dev/v6-auto-archiver'
# browsertrix_home = ""
# tmp_dir = ArchivingContext.get_tmp_dir()
# try:
# # DM get strange AttributeError if include self.browsertrix_home - taken out for now
# # browsertrix_home = self.browsertrix_home or os.path.abspath(ArchivingContext.get_tmp_dir())
# # browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
# browsertrix_home = os.path.abspath(tmp_dir)
# except FileNotFoundError:
# logger.debug(f'Dev found in function 2')
# # tmp_dir = ArchivingContext.get_tmp_dir()
# foo = tmp_dir[1:]
# browsertrix_home = f'{hard_code_directory_for_wsl2}{foo}'
browsertrix_home = linux_tmp_dir
docker_commands = ["docker", "run", "--rm", "-v", f"{browsertrix_home}:/crawls/", "webrecorder/browsertrix-crawler"]
cmd = docker_commands + [
"crawl",
"--url", url_build,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout),
"--combineWarc"
]
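        # --combineWarc asks browsertrix-crawler to merge its WARC output; the code further
        # down expects the combined file at collections/<collection>/<collection>_0.warc.gz
        # (unlike enrich() above, which picks up the generated .wacz instead).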
if self.profile:
            # DM 31st Oct: patch this back in to make sure the profile is there... I'm guessing it already is
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
logger.debug(f"copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn)
# profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
# logger.debug(f"copying {self.profile} to {profile_fn}")
# shutil.copyfile(self.profile, profile_fn)
# cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return False
if os.getenv('RUNNING_IN_DOCKER'):
filename = os.path.join("collections", collection, f"{collection}.wacz")
else:
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}_0.warc.gz")
if not os.path.exists(filename):
logger.warning(f"Unable to locate and upload WACZ {filename=}")
return False
warc_filename = filename
counter = 100
seen_urls = set()
next_fb_id = 0
with open(warc_filename, 'rb') as warc_stream:
for record in ArchiveIterator(warc_stream):
# 1.Get next fb_id logic
if record.rec_type == 'request':
uri = record.rec_headers.get_header('WARC-Target-URI')
if "bulk-route-definitions/" in uri:
content = record.content_stream().read()
foo = str(content)
# Strategy 1 test
# photo%2F%3Ffbid%3D1646726009098072%26set%3Dpcb.1646726145764725%26
# fbid = 1646726009098072
# set = pcb.1646726145764725
                        photo_string_start_pos = foo.find('photo%2F%3Ffbid%3D', 0)
                        # photo_string_start_pos = foo.find('%2Fphotos%2Fpcb.', 0)
                        if photo_string_start_pos > 0:
                            fbid_start_pos = photo_string_start_pos + 18
                            middle_26_start_pos = foo.find('%26', fbid_start_pos)
                            # only need this!
                            next_fb_id_foo = foo[fbid_start_pos:middle_26_start_pos]
# check haven't found current page fb_id
if next_fb_id_foo == current_fb_id:
logger.debug("found current fb_id so ignoring")
else:
next_fb_id = next_fb_id_foo
else:
                            logger.debug('photo string not found in bulk-route-definitions - this is normal; have only seen roughly 1 in 3 of these records contain it')
# end of strategy 1,2,3
continue
if record.rec_type != 'response': continue
record_url = record.rec_headers.get_header('WARC-Target-URI')
# 2.Save image logic
# THIS COULD BE A PROBLEM as each sub page will save 3 or 5 images probably
# filter by media mimetypes
content_type = record.http_headers.get("Content-Type")
if not content_type: continue
if not any(x in content_type for x in ["video", "image", "audio"]): continue
# DM - ignore this specialised content type for facebook
if content_type == "image/x.fb.keyframes": continue
# create local file and add media
ext = mimetypes.guess_extension(content_type)
warc_fn = f"warc-file-{collection}-{counter}{ext}"
# fn = os.path.join(tmp_dir, warc_fn)
fn = os.path.join(linux_tmp_dir, warc_fn)
with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
# FB serves many images in the page as helpers - we just want the main high res image
# many of the small images are from comments
# 1k jpg
# 22k png
# 35k png - mobile phone image
# 17k png
# gifs are common in comments
# DM if size of media file is < x discard
fs = os.path.getsize(fn)
if fs < 5000 and ext == ".jpg": continue
if fs < 37000 and ext == ".png": continue
if ext == ".gif": continue
if ext == ".ico": continue
                if ext is None: continue
m = Media(filename=fn)
m.set("src", record_url)
m.set("src_alternative", record_url)
to_enrich.add_media(m, warc_fn)
logger.info(f'Adding {fn=} which is {fs} bytes {record_url=} ')
counter += 1
seen_urls.add(record_url)
return next_fb_id