"""Quantify Facebook user activity over time.
Facebook allows users to download a copy of their own data in a machine-readable format.
This program extracts and quantifies information about user activity from the data
Facebook provides to give users an aggregated, bird's-eye overview of their Facebook
activity over time.
The program produces a CSV file with dates as the index and events (e.g.
'message_sent' or 'commented') as columns, a format that makes the data
easy to analyze.
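Example invocation (the folder and user names here are hypothetical):
    python facebook_quantifier.py --folder facebook-janedoe --user "Jane Doe" --verbose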
"""
import argparse
import csv
import json
import sys
from collections import Counter
from datetime import date, datetime # 'date' used for type hints only
from pathlib import Path
from re import findall
from typing import Optional, Union
class FacebookQuantifier:
"""Extract and quantify events in user data provided by Facebook.
To count user activity, this class extracts timestamps found in a pre-defined list
of files contained in the Facebook data, collecting dates of the various user
activities found in the data. See the list of data attributes below for a full
overview. Note that Facebook is continuously changing what information is available
in the downloadable archive and some of these data attributes are only available in
older versions of the archive.
Attributes (required to instantiate object):
folder : Path
Folder that contains the data provided by Facebook
user : str
Name of the Facebook user
Possible data attributes (automatically generated upon instantiation):
ad_interaction: list[date]
User clicked on an ad
added_friend : list[date]
User connected with someone on Facebook
received_friend_request : list[date]
User received friend request
Note: Only includes pending requests that have not been accepted
rejected_friend_request : list[date]
User rejected friend request
removed_friend : list[date]
User removed friend
apps_posts : list[date]
Installed Facebook app posted on the user's timeline
commented : list[date]
User commented on posts from friends
Note: 'posts' includes photos
event_invitation : list[date]
User received event invitation
group_membership_activity : list[date]
User interacted with group (e.g. by joining them)
group_posts : list[date]
User posted in group
installed_app : list[date]
User installed Facebook apps
liked_external_pages : list[date]
User clicked the 'Like' button on external site
liked_page : list[date]
User liked a Facebook page
created_page : list[date]
User created a Facebook page
created_note : list[date]
User created a note (similar to pinned posts on timeline)
others_posts_timeline : list[date]
Others posted on the user's timeline
profile_updated : list[date]
User updated her profile
reactions : list[date]
User 'reacted' with 'Like' or other similar action
responded_event : list[date]
User reacted to events (e.g. 'going')
searched : list[date]
User searched something on Facebook
poked : list[date]
User poked someone
voted : list[date]
User voted in a poll
saved_item : list[date]
User 'saved' a post or other item
followed_sb_st : list[date]
User followed somebody or something
addressbook_entry : list[date]
User added someone to Facebook address book
own_posts : dict[str, list[date]]
User posted on own timeline. Dict keys are:
- 'own_posts_all' (anything posted)
- 'own_posts_media' (photos and videos only)
- 'own_posts_text_only' (no links or media)
- 'own_posts_links' (links or links + text)
messages : dict[str, list[date]]
Timestamps when user received or sent a message in Facebook messenger.
Keys are 'message_sent' and 'message_received' unless no sent messages
could be identified (because the user name was not found). In that case,
the dict has only one key named 'message_received_or_sent'
viewed : dict[str, list[date]]
Timestamps when user viewed various items. Possible keys:
- 'viewed_video'
- 'viewed_article'
- 'viewed_marketplace_item'
visited : dict[str, list[date]]
Timestamps when user visited other people's profiles or various pages.
Possible keys:
- 'visited_profile'
- 'visited_page'
- 'visited_event_page'
- 'visited_group_page'
- 'visited_marketplace'
clicked_menu_items : list[date]
Timestamps when user clicked an item in the user interface
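Example usage (a minimal sketch; folder and user name are hypothetical):
    quantifier = FacebookQuantifier(Path("facebook-janedoe"), "Jane Doe")
    quantifier.write_csv()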
"""
def __init__(self, folder: Path, user: str) -> None:
"""Scan the data provided by Facebook and extracts timestamps of activities.
Parameters
----------
folder : Path
The folder holding the data provided by Facebook, usually named
'facebook-<user name>'
user : str
The name of the user the Facebook data belongs to. Whitespace is
removed and string is lowercased. This makes sure the user name
always has the same format, regardless of whether it was provided
as an argument or extracted from the Facebook data folder.
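For example, a user name passed as 'Jane Doe' (hypothetical) is normalized
to 'janedoe' before being compared against sender names in message files.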
"""
# TODO: Maybe instead of a whitelist, make a blacklist of files to ignore. For
# all the other files, call file-specific functions where necessary and
# get_timestamps for everything else. Then you just automatically name the
# activity after the file name and tell people to look up the Facebook page
# if it's unclear what it represents.
self.folder = folder
self.user = user.replace(" ", "").lower()
self.json_files = list(Path(self.folder).rglob("*.json"))
for file in self.json_files:
if file.name == "friends.json":
self.added_friend = self.get_timestamps(file)
# File named differently in different versions of the archive
elif file.name in ["received_friend_requests.json",
"friend_requests_received.json"]:
self.received_friend_request = self.get_timestamps(file)
elif file.name == "rejected_friend_requests.json":
self.rejected_friend_request = self.get_timestamps(file)
elif file.name == "removed_friends.json":
self.removed_friend = self.get_timestamps(file)
elif file.name == "apps_and_websites.json":
self.installed_app = self.get_timestamps(file, "added_timestamp")
elif file.name == "comments.json":
self.commented = self.get_timestamps(file)
elif file.name == "posts_and_comments.json":
self.reactions = self.get_timestamps(file)
# Only available in older versions of the downloaded archives
elif file.name == "other_people's_posts_to_your_timeline.json":
self.others_posts_timeline = self.get_timestamps(file)
# Only available in older versions of the downloaded archive
elif file.name == "notes.json":
self.created_note = self.get_timestamps(file, "created_timestamp")
elif file.name == "your_event_responses.json":
self.responded_event = self.get_timestamps(file, "start_timestamp")
elif file.name == "your_group_membership_activity.json":
self.group_membership_activity = self.get_timestamps(file)
elif file.name == "your_posts_and_comments_in_groups.json":
self.group_posts = self.get_timestamps(file)
elif file.name == "posts_from_apps_and_websites.json":
self.apps_posts = self.get_timestamps(file)
elif file.name == "event_invitations.json":
self.event_invitation = self.get_timestamps(file, "start_timestamp")
# File named differently in different versions of the archive
elif file.name in ["pages.json", "pages_you've_liked.json"]:
self.liked_page = self.get_timestamps(file)
elif file.name == "your_pages.json":
self.created_page = self.get_timestamps(file)
# Only available in older versions of the downloaded archive
elif file.name == "likes_on_external_sites.json":
self.liked_external_pages = self.get_timestamps(file)
elif file.name == "profile_update_history.json":
self.profile_updated = self.get_timestamps(file)
elif file.name == "your_search_history.json":
self.searched = self.get_timestamps(file)
# Only available in older versions of the downloaded archive
elif file.name == "advertisers_you've_interacted_with.json":
self.ad_interaction = self.get_timestamps(file)
elif file.name == "pokes.json":
self.poked = self.get_timestamps(file)
elif file.name == "polls_you_voted_on.json":
self.voted = self.get_timestamps(file)
elif file.name == "saved_items_and_collections.json":
self.saved_item = self.get_timestamps(file)
# File named differently in different versions of the archive
elif file.name in ["following.json", "who_you_follow.json"]:
self.followed_sb_st = self.get_timestamps(file)
# Only available in older versions of the downloaded archive
elif file.name == "your_address_books.json":
self.addressbook_entry = self.get_timestamps(file, "created_timestamp")
# These files contain various types of events and require dedicated
# functions in order to distinguish between those different types
elif file.name == "your_posts_1.json":
posts = self.get_own_posts(file)
for post_type, dates in posts.items():
if post_type == "own_posts_all":
self.own_posts_all = dates
if post_type == "own_posts_media":
self.own_posts_media = dates
if post_type == "own_posts_text_only":
self.own_posts_text_only = dates
if post_type == "own_posts_links":
self.own_posts_links = dates
# File named differently in different versions of the archive
elif file.name in ["viewed.json", "recently_viewed.json"]:
views = self.get_viewed(file)
for view_type, dates in views.items():
if view_type == "viewed_video":
self.viewed_video = dates
if view_type == "viewed_article":
self.viewed_article = dates
if view_type == "viewed_marketplace_item":
self.viewed_marketplace_item = dates
# File named differently in different versions of the archive
elif file.name in ["visited.json", "recently_visited.json"]:
visited = self.get_visited(file)
for visited_type, dates in visited.items():
if visited_type == "visited_profile":
self.visited_profile = dates
if visited_type == "visted_page":
self.visited_page = dates
if visited_type == "visited_event_page":
self.visited_event_page = dates
if visited_type == "visited_group_page":
self.visited_group_page = dates
if visited_type == "visited_marketplace":
self.visited_marketplace = dates
# Only available in older versions of the downloaded archive
elif file.name == "menu_items.json":
self.clicked_menu_items = self.get_menu_items(file)
# Messages are analyzed separately
messages = self.get_messages()
if messages:
for message_type, dates in messages.items():
if message_type == "message_sent":
self.message_sent = dates
if message_type == "message_received":
self.message_received = dates
if message_type == "message_received_or_sent":
self.message_received_or_sent = dates
def get_timestamps(self, file_path: Path,
timestr: str = "timestamp"
) -> Optional[list[date]]:
"""Extract timestamps from files.
First, try to get timestamps assuming that they are inside a
dict[str, list[dict]] structure. This is how most of the files provided
by Facebook are structured. If that returns an empty list, try again
assuming a deeper nesting.
Args:
file_path (Path): Path to JSON file.
timestr (str, optional):
The name of the JSON field with the event timestamps
varies across the data Facebook provides. Most often
it is simply "timestamp", but sometimes "created_timestamp"
for example. This arg makes sure the right term is used.
Returns:
Optional[list[date]]: Lists of timestamps found in the data.
"""
json_file = self.load_file(file_path)
# JSON file is a dict structured as:
# dict[str, list[dict[str, int]]]
timestamps = [
datetime.fromtimestamp(item[timestr]).date()
for value in json_file.values()
# Unpack list in: dict[str, list]
for item in value
# Check nested dict in: dict[str, list[dict]]
if timestr in item
]
if not timestamps:
# JSON file is a dict structured as:
# dict[str, dict[str, list[dict[str, int]]]]
timestamps = [
datetime.fromtimestamp(item[timestr]).date()
for key in json_file
for inner_value in json_file.get(key).values()
for item in inner_value
if timestr in item
]
if not timestamps:
print(f"\t-No timestamps found in: {file_path}")
return None
return timestamps
def load_file(self, file_path: Path): # -> JSON
"""Load data into JSON.
Args:
file_path (Path): Path to JSON file.
Returns:
JSON: The input file loaded as JSON.
"""
with open(file_path, encoding="utf-8") as data_file:
return json.load(data_file)
def get_messages(self) -> Optional[dict[str, list[date]]]:
"""Get timestamps from message files.
Loops through files in messages folder and collects timestamps
of each file in a list, categorizing them as "sent" or "received"
by checking whether self.user matches the field "sender_name"
inside each JSON file. If self.user was not found in any file, the returned
dict has a single key named 'message_received_or_sent' to indicate the lack
of distinction.
Returns:
Optional[dict[str, list[date]]]:
Dict with lists of timestamps; keys are 'message_sent'
and 'message_received', or 'message_received_or_sent'
if no sent messages were found
"""
# Newer versions of downloadable archive include extra files that
# don't contain information relevant here
exclude: list[str] = ["autofill_information.json", "secret_groups.json"]
files_messages: list[Path] = [
file for file in self.json_files
if Path(self.folder, 'messages') in file.parents
if file.name not in exclude
]
if not files_messages:
return None
# To collect items across multiple files, create the dict of lists up front
# and extend the lists with each file
messages: dict[str, list[date]] = {
"message_sent": [],
"message_received": []
}
for file in files_messages:
json_file = self.load_file(file)
messages["message_sent"].extend(
[
datetime.fromtimestamp(message["timestamp_ms"] / 1000).date()
for message in json_file["messages"]
# Format sender name the same way self.user was formatted (no spaces, lowercase)
if message["sender_name"].replace(" ", "").lower() == self.user
]
)
messages["message_received"].extend(
[
datetime.fromtimestamp(message["timestamp_ms"] / 1000).date()
for message in json_file["messages"]
if message["sender_name"].replace(" ", "").lower() != self.user
]
)
if not messages["message_sent"]:
# If no sent messages could be detected, it means that the user name wasn't
# found in message files. If that happens, rename 'message_received' to
# 'message_received_or_sent' and remove 'message_sent' to indicate lack
# of distinction
print(
"No sent messages found, verify if the provided user name is "
"identical to the name used in Facebook Messenger.\n"
)
del messages["message_sent"]
messages["message_received_or_sent"] = messages.pop("message_received")
return messages
def get_own_posts(self, file_path: Path) -> dict[str, list[date]]:
"""Get timestamps for own posts, separated by type of post.
To determine different types of posts (media, text only, link), turn
the values of each item inside the JSON file into a string and check
if the relevant field that identifies the type of post is in that string.
This is done because the metadata for each post is deeply nested in
lists of dicts, so flattening it into a string makes it much easier to
detect whether a certain field is present.
Args:
file_path (Path): Path to JSON file
Returns:
dict[str, list[date]]:
Dict with lists of timestamps, keys are types of posts.
"""
json_file = self.load_file(file_path)
posts: dict[str, list[date]] = {}
posts["own_posts_all"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in json_file
]
posts["own_posts_media"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in json_file
if "media" in str(item.values())
]
posts["own_posts_text_only"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in json_file
if "external_context" not in str(item.values())
if "media" not in str(item.values())
]
posts["own_posts_links"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in json_file
if "external_context" in str(item.values())
if "media" not in str(item.values())
]
return posts
def get_viewed(self, file_path: Path) -> dict[str, list[date]]:
"""Get timestamps for viewed items, separated by type of item.
Captures when the user viewed a video, an article or a marketplace item.
Note that each video viewed is listed three times in the data: 'time spent',
'shows' and 'time viewed'. Here we only use the last one as it seems most
complete.
Args:
file_path (Path): Path to JSON file
Returns:
dict[str, list[date]]:
Dict with lists of timestamps, keys are types of items viewed.
"""
json_file = self.load_file(file_path)
views: dict[str, list[date]] = {}
# Older versions of the personal data archive used a different key.
# First check if old key available, otherwise use new key
# TODO: See if you can make this work without having to know the name
# of the key of the very first dict. E.g. with using json_file.values()
if not json_file.get("viewed_things"):
json_key: str = "recently_viewed"
else:
json_key = "viewed_things"
for category in json_file[json_key]:
if category['name'] == "Facebook Watch Videos and Shows":
for watchtype in category['children']:
if watchtype['name'] == "Time Viewed":
views["viewed_video"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in watchtype['entries']
]
if category['name'] == "Articles":
views['viewed_article'] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in category["entries"]
]
# "Marketplace" entry only available in older versions of the
# downloadable archive. Includes views of marketplace items.
# In newer versions, this information is available in 'recently
# visited'. Newer versions of the 'viewed' overview also show
# when a marketplace item was shown on Facebook, regardless of
# whether the user interacted with it.
if category['name'] == "Marketplace":
for market_view in category['children']:
views["viewed_marketplace_item"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in market_view['entries']
]
return views
def get_visited(self, file_path: Path) -> dict[str, list[date]]:
"""Get timestamps for views on various items like profiles.
Args:
file_path (Path): Path to JSON file
Returns:
dict[str, list[date]]:
Dict with lists of timestamps, keys are types of items visited.
"""
json_file = self.load_file(file_path)
# Older versions of the personal data archive used a different key.
# First check if old key available, otherwise use new key
if not json_file.get("visited_things"):
json_key: str = "visited_things_v2"
else:
json_key = "visited_things"
visited: dict[str, list[date]] = {}
for category in json_file[json_key]:
if category['name'] == "Profile visits":
visited["visited_profile"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in category["entries"]
]
if category['name'] == "Page visits":
visited["visited_page"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in category["entries"]
]
if category['name'] == "Events visited":
visited["visited_event_page"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in category["entries"]
]
if category['name'] == "Groups visited":
visited["visited_group_page"] = [
datetime.fromtimestamp(item["timestamp"]).date()
for item in category["entries"]
]
# This information is only available in newer versions of the downloadable
# archive. Different to other data points, the date is represented as a
# string formatted as 'Sep 20, 2014', which we're converting here
if category['name'] == "Marketplace Visits":
visited["visited_marketplace"] = [
datetime.strptime(item['data']['value'], '%b %d, %Y').date()
for item in category["entries"]
]
return visited
def get_menu_items(self, file_path: Path) -> list[date]:
"""Get timestamps for clicked menu items.
Args:
file_path (Path): Path to JSON file
Returns:
list[date]:
List of timestamps
"""
json_file = self.load_file(file_path)
return [
datetime.fromtimestamp(item["timestamp"]).date()
for item in json_file['menu_items'][0]['entries']
]
def write_csv(self) -> None:
"""Write CSV summarizing Facebook activities per day.
Creates a spreadsheet where each row represents a day (formatted Year-Month-Day)
and each column shows how often various activities took place on that day (for
example how often a message was sent on Facebook).
"""
# To build rows for the spreadsheet, we first build a sorted list of unique dates
# found across all collected activities. This is used as an index in the
# spreadsheet and makes sure that the data is sorted by date. Second, we get the
# count of each activity per date
activities: list[str] = self.get_activities()
unique_dates: list[date] = self.get_unique_dates(activities)
activity_counts: dict[str, Counter] = self.get_activity_counts(activities)
with open(f"facebook_data_{self.user}.csv", 'w', newline='') as csvfile:
fieldnames = ["date"] + activities
csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
csvwriter.writeheader()
# For each unique date, create a row and check if this date can be found in
# each Counter item of 'activity_counts'. If yes, add count to current row.
# If an activity doesn't have any count on a unique date, add 'None' to row.
for unique_date in unique_dates:
row: dict[str, Union[date, Optional[int]]] = {"date": unique_date}
for activity, counter in activity_counts.items():
for day, count in counter.items():
if day == unique_date:
row[activity] = count
if not row.get(activity):
row[activity] = None
csvwriter.writerow(row)
print(f"Saved file: facebook_data_{self.user}.csv")
def summarize_data(self) -> None:
"""Print overview showing how often each activity was found.
For each activity collected in the current FacebookQuantifier instance, print the
number of timestamps recorded for that activity.
"""
print("Found the following number of activities:\n")
for activity in self.get_activities():
print(f"- {activity}: {len(getattr(self, activity))}")
def get_unique_dates(self, activities: list[str]) -> list[date]:
"""Get unique dates across all captured activities.
Parameters
----------
activities : list[str]
List of names of all collected activities in current FacebookQuantifier
instance.
Returns
-------
list[date]
List of unique dates (sorted)
"""
unique_dates_set: set[date] = {
day for activity in activities
for day in getattr(self, activity)
}
return sorted(unique_dates_set)
def get_activity_counts(self, activities: list[str]) -> dict[str, Counter]:
"""Create a dict summarizing the count of each activity per day.
Parameters
----------
activities : list[str]
List of names of all collected activities in current FacebookQuantifier
instance.
Returns
-------
dict[str, Counter]
Dicts with activity names as keys and Counter objects as values. Counter
objects are structured as dicts with dates as keys and counts as values.
"""
activity_counts: dict[str, Counter] = {}
for activity in activities:
activity_counts[activity] = Counter(getattr(self, activity))
return activity_counts
def get_activities(self) -> list[str]:
"""Get a list of all captured activities.
Returns
-------
list[str]
List of names of all attributes of the current FacebookQuantifier instance
that represent activities.
"""
# Exclude non-activity attributes and ones set to None (no timestamps found)
exclude: list[str] = ["folder", "user", "json_files"]
return [attribute for attribute, value in self.__dict__.items()
if attribute not in exclude and value is not None]
def main() -> None:
"""Set up a FacebookQuantifier instance.
If no args are provided, scan the current directory for folders named
"facebook-<username>", which is the naming scheme Facebook uses by default. For
each folder detected, use a regex to get the user name included in the folder name.
Optionally, folder and user name can be provided as args.
For each folder-user pair, create a FacebookQuantifier instance and write
a CSV file.
"""
argparser = argparse.ArgumentParser(
description="""Extracts and quantifies user activity based on the data
Facebook provides using 'Download Your Information'. If no arguments
are provided, scans current directory for folders named facebook-<username>.""",
)
argparser.add_argument(
"--folder",
"-f",
help="Path to folder containing Facebook data."
)
argparser.add_argument(
"--user",
"-u",
help="Name of the Facebook user. Used to distinguish sent and received "
"messages in Facebook messenger."
)
argparser.add_argument(
"--verbose",
"-v",
action="store_true",
help="Print report listing frequency of activities found in the data."
)
args = argparser.parse_args()
# Check folder(s)
if args.folder:
if Path(args.folder).expanduser().is_dir():
base_folders = [Path(args.folder).expanduser()]
else:
print("Specified path is not a folder, please verify.")
sys.exit()
else:
# If no folder provided, scan current directory
base_folders = [
folder for folder in Path().iterdir()
if folder.name.startswith("facebook-")
if folder.is_dir()
]
if not base_folders:
print(
"Could not find folder(s). Either provide path to folder using "
"--folder='/path/to/folder' or make sure the Facebook data is in "
"the same folder as the script and named 'facebook-<user name>'."
)
sys.exit()
# Check user name(s)
if args.user and len(base_folders) > 1:
print(
"Multiple data folders found but only one user name provided. "
"Please specify folder using --folder."
)
sys.exit()
elif args.user and len(base_folders) == 1:
usernames = [args.user]
else:
try:
# When no user name was provided, try to find it in the folder name with a regex
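# For example, a folder named 'facebook-janedoe' (hypothetical) yields
# the user name 'janedoe' via findall(r"-(.*)", "facebook-janedoe")[0].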
usernames = [
findall(r"-(.*)", subfolder.name)[0]
for subfolder in base_folders
]
except IndexError:
print(
"Could not find user name in folder name. Please provide "
"a user name using the --user argument or make sure the folder "
"is named facebook-<user name> for automatic detection."
)
sys.exit()
# Instantiate FacebookQuantifier, summarize the data found and create CSV
for folder, user_name in zip(base_folders, usernames):
print(f"Checking data in folder '{folder}' for '{user_name}'")
facebook_data = FacebookQuantifier(folder, user_name)
facebook_data.write_csv()
if args.verbose:
facebook_data.summarize_data()
if __name__ == "__main__":
main()