From 81ecfbaec45cfebf7c547ed5daae5e59652fcac5 Mon Sep 17 00:00:00 2001 From: Levente Csoke Date: Wed, 12 Aug 2020 19:58:28 +0200 Subject: [PATCH 1/3] changed how we gather individual people from convos; solved and added some todos; 1 test failing only --- Conversations.py | 116 --------------- FacebookData.py | 40 ----- Friends.py | 36 ----- Individual.py | 47 ------ People.py | 79 ---------- __main__.py | 5 + Miner.py => miner/App.py | 25 ++-- .../ConversationAnalyzer.py | 22 +-- miner/Conversations.py | 139 ++++++++++++++++++ miner/FacebookData.py | 30 ++++ miner/Friends.py | 27 ++++ Group.py => miner/Group.py | 2 +- miner/Individual.py | 63 ++++++++ Me.py => miner/Me.py | 2 +- .../MessagingAnalyzer.py | 25 ++-- miner/People.py | 52 +++++++ Visualizer.py => miner/Visualizer.py | 6 +- tests/TestMessages.py => miner/__init__.py | 0 utils.py => miner/utils.py | 90 +++++++----- tests/TestPeople.py | 36 ----- tests/conftest.py | 10 +- ...alyzer.py => test_ConversationAnalyzer.py} | 23 ++- ...Conversations.py => test_Conversations.py} | 17 ++- tests/{TestFriends.py => test_Friends.py} | 4 +- tests/test_Messages.py | 0 ...gAnalyzer.py => test_MessagingAnalyzer.py} | 9 +- tests/test_People.py | 39 +++++ tests/test_utils.py | 2 +- 28 files changed, 472 insertions(+), 474 deletions(-) delete mode 100644 Conversations.py delete mode 100644 FacebookData.py delete mode 100644 Friends.py delete mode 100644 Individual.py delete mode 100644 People.py create mode 100644 __main__.py rename Miner.py => miner/App.py (63%) rename ConversationAnalyzer.py => miner/ConversationAnalyzer.py (84%) create mode 100644 miner/Conversations.py create mode 100644 miner/FacebookData.py create mode 100644 miner/Friends.py rename Group.py => miner/Group.py (92%) create mode 100644 miner/Individual.py rename Me.py => miner/Me.py (78%) rename MessagingAnalyzer.py => miner/MessagingAnalyzer.py (93%) create mode 100644 miner/People.py rename Visualizer.py => miner/Visualizer.py (88%) rename tests/TestMessages.py => miner/__init__.py (100%) rename utils.py => miner/utils.py (82%) delete mode 100644 tests/TestPeople.py rename tests/{TestConversationAnalyzer.py => test_ConversationAnalyzer.py} (94%) rename tests/{TestConversations.py => test_Conversations.py} (79%) rename tests/{TestFriends.py => test_Friends.py} (91%) create mode 100644 tests/test_Messages.py rename tests/{TestMessagingAnalyzer.py => test_MessagingAnalyzer.py} (98%) create mode 100644 tests/test_People.py diff --git a/Conversations.py b/Conversations.py deleted file mode 100644 index 3fb1fbd..0000000 --- a/Conversations.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -from FacebookData import FacebookData -import pandas as pd - -from datetime import datetime - -MESSAGE_SUBPATH = 'messages/inbox' - - -class Conversations: - def __init__(self, data_path): - self.data_path = f'{data_path}/{MESSAGE_SUBPATH}' - - def get_people(self): - json_paths = self.walk_directory_and_search('.json') - return self.extract_names_from_convos(json_paths) - - def walk_directory_and_search(self, extension): - paths = [] - for root, dirs, files in os.walk(self.data_path): - for name in files: - if name.endswith(extension): - paths.append(os.path.join(root, name)) - return paths - - # TODO simplify this function!! also this takes very long - @staticmethod - def extract_names_from_convos(jsons): - name_data_map = {} - count = 0 - for file in jsons: - msg = Messages(file) - for participant in msg.participants: - key = participant if msg.ttype == 'Regular' else f'group_{count}' - if key == 'Facebook User': # TODO ?? what to do with this?? - continue - if name_data_map.get(key) and key.startswith( - 'group'): # making sure run only once even if it is a group - continue - if name_data_map.get(key): - dfs = [name_data_map[key]['messages'], msg.df] - name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index() - else: - name_data_map[key] = { - 'title': msg.title, - 'compact_name': msg.compact_names, - # 'participants': msg.participants + ['Levente Csőke'], - 'participants': msg.participants, - 'messages': msg.df, - 'friend': None, - 'messages_dir': msg.messages_dir, - 'media_dir': msg.media_dir - } - if msg.ttype == 'RegularGroup': - count += 1 - - return name_data_map - - -class Messages(FacebookData): - def __init__(self, json_path): - super().__init__(json_path) - self.to_df() - self.set_date_as_index() - - def to_df(self): - self._df = pd.DataFrame(self.decoded.get('messages')) - - def set_date_as_index(self): - # NOTE maybe not needed; could calculate real time - date_series = self._df.timestamp_ms.apply(self.ts_to_date) - self._df = self._df.set_index(date_series).iloc[::-1] - - @property - def names(self): - return pd.DataFrame(self.participants)[0] - - @property - def participants(self): - participants = self.decoded.get('participants') - # TODO I should be IN - # but this breaks stuff at TestMessagingAnalyzer - return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] - # return [p.get('name') for p in participants if p.get('name')] - - @property - def title(self): - return self.decoded.get('title') - - @property - def ttype(self): - return self.decoded.get('thread_type') - - @property - def messages_dir(self): - thread_path = self.decoded.get('thread_path') - if not thread_path.startswith('inbox/'): - raise ValueError('Something is not okay.') - # TODO here or in the upper function where we extract names - return thread_path.split('/')[1].lower() - - @property - def media_dir(self): - # todo what should the path contain - for media in ['photos', 'gifs', 'files', 'videos', 'audio']: - if media in self._df.columns: - media_in_msg = list(self._df[media][self._df[media].notnull()]) - # if len(media_in_msg) > 1: # TODO is this ok. i think it is. think multiple photos sent once - # print('Media in msg is bigger than 1') - uri = media_in_msg[0][0].get('uri') - return os.path.dirname(os.path.dirname(uri)) - return None - - @staticmethod - def ts_to_date(date): - return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/FacebookData.py b/FacebookData.py deleted file mode 100644 index a82c896..0000000 --- a/FacebookData.py +++ /dev/null @@ -1,40 +0,0 @@ -from utils import read_json, decode_text, accents_map - - -class FacebookData: - def __init__(self, json_path): - self.json_path = json_path - self._df = None - - @property - def df(self): - return self._df - - @property - def decoded(self): - return decode_text(self.json) - - @property - def json(self): - return read_json(self.json_path) - - @property - def compact_names(self): - # NOTE this is the place where we change pd/np to builtin - # do we have to do this? - name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names))) - return name_list[0] if len(name_list) == 1 else name_list - - @staticmethod - def lower_names(col): - return col.str.lower() - - @staticmethod - def without_accent_and_whitespace(col): - def replace_accents(text): - for char in accents_map.keys(): - if char in text: - text = text.replace(char, accents_map[char]) - return text.replace(' ', '') - - return col.apply(replace_accents) diff --git a/Friends.py b/Friends.py deleted file mode 100644 index 6e0e991..0000000 --- a/Friends.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd -import os -from FacebookData import FacebookData -from utils import accents_map - - -class Friends(FacebookData): - - def __init__(self, *args): - super().__init__(*args) - - # self.path = 'data/friends' - # self.json_path = f'{self.path}/friends.json' - - self.to_df() - - def get_people(self): - names = {} - for name, compact in zip(self.names, self.compact_names): - names[name] = { - 'title': name, - 'compact_name': compact, - 'messages': None, - 'friend': True, - 'participants': None, - 'messages_dir': None, - 'media_dir': None - } - return names - - def to_df(self): - self._df = pd.DataFrame(self.decoded.get('friends')) - - @property - def names(self): - return self.df.name diff --git a/Individual.py b/Individual.py deleted file mode 100644 index a9f8d03..0000000 --- a/Individual.py +++ /dev/null @@ -1,47 +0,0 @@ -class Individual: - def __init__(self, name=None, title=None,compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, - member_of=None): - self._name = name - self._title = title - self._compact_name = compact - self._messages = messages - self._friend = friend - self._messages_dir = messages_dir - self._media_dir = media_dir - self._member_of = member_of - - - def __repr__(self): - return self.name - - @property - def name(self): - return self._name - - @property - def title(self): - return self._title - - @property - def messages(self): - return self._messages - - @property - def friend(self): - return self._friend - - @property - def media_dir(self): - return self._media_dir - - @property - def messages_dir(self): - return self._messages_dir - - @property - def compact_name(self): - return self._compact_name - - @property - def member_of(self): - return self._member_of diff --git a/People.py b/People.py deleted file mode 100644 index 11d1887..0000000 --- a/People.py +++ /dev/null @@ -1,79 +0,0 @@ -from Individual import Individual -from Conversations import Conversations -from Friends import Friends - -# from Me import Me -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' -import time -from Group import Group - - -# TODO we dont need both data and individuals... or?? - -class People: - def __init__(self, path=None): - self.data_path = path if path else DATA_PATH - self._names = [] - self._individuals = {} - self._groups = [] - self._data = self.get_people() # TODO is this supposed to be here or elsewhere - self.to_individuals() # TODO is this supposed to be here or elsewhere - - @property - def data(self): - return self._data - - @property - def names(self): - return self._names - - @property - def individuals(self): - return self._individuals - - @property - def groups(self): - return self._groups - - def get_people(self): - start = time.time() - friends = Friends(self.data_path + '/friends/friends.json') - people1 = friends.get_people() - print('friends: ', time.time() - start) - - # TODO LATER too slow - # takes about 30 secs both - # read it once, store it in DB OR? - start = time.time() - conversations = Conversations(self.data_path) - people2 = conversations.get_people() - print('convos: ', time.time() - start) - - return self.unify_people(people1, people2) - - def to_individuals(self): # TODO maybe rather split_convos or differentiate_convos - start = time.time() - for person, data in self._data.items(): - if person.startswith('group'): - g = Group(name=data.get('name'), title=data.get('title'), messages=data.get('messages'), - compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), - media_dir=data.get('media_dir'), members=None) - self._groups.append(g) - else: - indie = Individual(name=person, title=data.get('title'), messages=data.get('messages'), - compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), - media_dir=data.get('media_dir'), member_of=None) - self._names.append(person) - self._individuals[person] = indie - print('indies: ', time.time() - start) - - @staticmethod - def unify_people(friends, convos): - for person, data in friends.items(): - if not convos.get(person): - convos[person] = data - convos[person]['friend'] = True - return convos - -# if __name__ == '__main__': -# p = People() diff --git a/__main__.py b/__main__.py new file mode 100644 index 0000000..33f7113 --- /dev/null +++ b/__main__.py @@ -0,0 +1,5 @@ +from miner.App import App + +if __name__ == '__main__': + app = App() + app.analyze_messages() \ No newline at end of file diff --git a/Miner.py b/miner/App.py similarity index 63% rename from Miner.py rename to miner/App.py index 3b32806..c2560bd 100644 --- a/Miner.py +++ b/miner/App.py @@ -1,11 +1,11 @@ -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' +from miner.ConversationAnalyzer import ConversationAnalyzer +from miner.MessagingAnalyzer import MessagingAnalyzer +from miner.People import People -from People import People -from ConversationAnalyzer import ConversationAnalyzer -from MessagingAnalyzer import MessagingAnalyzer +DATA_PATH = '/home/levente/projects/facebook-data-miner/data' -class Miner: +class App: def __init__(self): pass @@ -15,8 +15,7 @@ def analyze_messages(): stats = {} - for name, person in p.individuals.items(): - #assert name == person.name, 'ERRRRRRROR!!!' + for name, person in p.data.items(): if person.messages is None: stats[person.name] = None continue @@ -24,7 +23,6 @@ def analyze_messages(): stats[person.name] = analyzer.stats # if stats[person.name].get('message_count').get('me') > 5000: # top[person.name] = stats[person.name] - example = stats['Dániel Nagy'] print() # print('LEN: ', len(top.keys())) @@ -33,13 +31,10 @@ def analyze_messages(): @staticmethod def analyze_messaging(): - p = People(path=DATA_PATH) - - msg_analyzer = MessagingAnalyzer(p.names, p.individuals) - - msgs = msg_analyzer.total_number_of_messages() + people = People(path=DATA_PATH) + msg_analyzer = MessagingAnalyzer(people) if __name__ == '__main__': - m = Miner() - m.analyze_messages() + app = App() + app.analyze_messages() diff --git a/ConversationAnalyzer.py b/miner/ConversationAnalyzer.py similarity index 84% rename from ConversationAnalyzer.py rename to miner/ConversationAnalyzer.py index cfe1a95..a5928bf 100644 --- a/ConversationAnalyzer.py +++ b/miner/ConversationAnalyzer.py @@ -1,5 +1,5 @@ import pandas as pd -from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals +from miner.utils import date_checker, period_checker, subject_checker, generate_date_series, get_stats_for_intervals class ConversationAnalyzer: @@ -25,7 +25,7 @@ def get_stats(self, df, subject='all', start=None, end=None, period=None): return stats def get_time_series_data(self, subject='all', **kwargs): - time_series = generate_time_series(**kwargs) + time_series = generate_date_series(**kwargs) return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) @staticmethod @@ -58,12 +58,6 @@ class ConversationStats: Statistics of conversation with one person. """ - # TODO do we need this or not?!?! smh - # def __new__(cls, df, *args, **kwargs): - # if not len(df.index): # This deals with the case if input df is empty - # return None - # return super(ConversationStats, cls).__new__(cls, *args, **kwargs) - def __init__(self, df): self.df = df @@ -131,13 +125,13 @@ def char_count(self): # 10. @property - def most_used_chars(self): - return None # TODO LATER or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string - - # 11. - @property def rate_of_media_messages(self): - pass # NOTE what? + """ + TODO LATER + search for media messages all 5 of them + rate is only the second or third abstraction + """ + pass def get_words(self): token_list = self.messages.str.lower().str.split() diff --git a/miner/Conversations.py b/miner/Conversations.py new file mode 100644 index 0000000..4f61ad9 --- /dev/null +++ b/miner/Conversations.py @@ -0,0 +1,139 @@ +import os +from miner.Group import Group +from miner.Individual import Individual +from miner.FacebookData import FacebookData +import pandas as pd +from miner import utils +from datetime import datetime + + +class Conversations: + def __init__(self, data_path): + self.indie_convo_paths = [] + self.group_convo_paths = [] + self.deleted_user_convo_paths = [] + + self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}' + self.order_paths() + + def order_paths(self): + json_paths = utils.walk_directory_and_search(self.data_path, '.json') + self.differentiate_paths(json_paths) + + def differentiate_paths(self, jsons): + for file in jsons: + msg = Messages(file) + if msg.title == 'Facebook User': + self.deleted_user_convo_paths.append(file) + elif msg.ttype == 'RegularGroup': + self.group_convo_paths.append(file) + elif msg.ttype == 'Regular': + self.indie_convo_paths.append(file) + else: + raise ValueError('Should not happen!') + + def get_people_from_private_messages(self, name=None, membership=True): + name_data_map = {} + paths = self.indie_convo_paths + if name is not None: + paths = self.filter_by_name(name) + for file in paths: + messages = Messages(file) + name = messages.title + if name_data_map.get(name) is not None: + dfs = [name_data_map[name].messages, messages.df] + name_data_map[name].messages = pd.concat(dfs).sort_index() + else: + # TODO we may also want to get group messages where name is present + name_data_map[name] = self.create_individual(messages, membership=membership) + return name_data_map + + def filter_by_name(self, name): + filtered_paths = [] + compact_name = None if name is None else utils.replace_accents(name.lower()) + for path in self.indie_convo_paths: + if compact_name in os.path.basename(os.path.dirname(os.path.normpath(path))): + filtered_paths.append(path) + return filtered_paths + + def create_individual(self, messages, membership=None): + return Individual( + name=messages.title, title=messages.title, # TODO depracate one of (name, title) + compact=messages.compact_names, + messages=messages.df, + messages_dir=messages.messages_dir, + media_dir=messages.media_dir, + member_of=self.group_membership(messages.title) if membership else None, + ) + + @staticmethod + def fill_data_map(message): + return { + 'title': message.title, + 'compact_name': message.compact_names, + # 'participants': msg.participants + ['Levente Csőke'], + 'participants': message.participants, + 'messages': message.df, + 'friend': None, + 'messages_dir': message.messages_dir, + 'media_dir': message.media_dir + } + + @staticmethod + def group_membership(name): + return None + + @staticmethod + def json_is_a_group_msg(file): + msg = Messages(file) + return msg.ttype == 'RegularGroup' + + +class Messages(FacebookData): + def __init__(self, json_path): + super().__init__(json_path) + self.to_df('messages') + self.set_date_as_index() + + @property + def names(self): + return pd.DataFrame(self.participants)[0] + + @property + def participants(self): + participants = self.decoded.get('participants') + # TODO I should be IN + # but this breaks stuff at TestMessagingAnalyzer + return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] + # return [p.get('name') for p in participants if p.get('name')] + + @property + def title(self): + return self.decoded.get('title') + + @property + def ttype(self): + return self.decoded.get('thread_type') + + @property + def messages_dir(self): + thread_path = self.decoded.get('thread_path') + if not thread_path.startswith('inbox/'): + raise ValueError('Field `thread_path` should start with `inbox/`.') + return thread_path.split('inbox/')[1] + + @property + def media_dir(self): + for media in utils.MEDIA_DIRS: + if media in self._df.columns: + media_in_msg = list(self._df[media][self._df[media].notnull()]) + uri = media_in_msg[0][0].get('uri') + return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] + + def set_date_as_index(self): + date_series = self._df.timestamp_ms.apply(self.ts_to_date) + self._df = self._df.set_index(date_series).iloc[::-1] + + @staticmethod + def ts_to_date(date): + return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/miner/FacebookData.py b/miner/FacebookData.py new file mode 100644 index 0000000..babe74d --- /dev/null +++ b/miner/FacebookData.py @@ -0,0 +1,30 @@ +from miner import utils +import pandas as pd + + +class FacebookData: + def __init__(self, json_path): + self.json_path = json_path + self._df = None + + @property + def df(self): + return self._df + + @property + def decoded(self): + return utils.decode_text(self.json) + + @property + def json(self): + return utils.read_json(self.json_path) + + @property + def compact_names(self): + # NOTE this is the place where we change pd/np to builtin + # do we have to do this? + name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) # should be just fine + return name_list[0] if len(name_list) == 1 else name_list + + def to_df(self, field=None): + self._df = pd.DataFrame(self.decoded.get(field)) diff --git a/miner/Friends.py b/miner/Friends.py new file mode 100644 index 0000000..7950155 --- /dev/null +++ b/miner/Friends.py @@ -0,0 +1,27 @@ +import pandas as pd +import os +from miner.FacebookData import FacebookData +from miner.Individual import Individual + + +class Friends(FacebookData): + + def __init__(self, *args): + super().__init__(*args) + self.to_df('friends') + + def get_people(self, name=None): + names = {} + for full_name, compact in zip(self.names, self.compact_names): + if name is not None and name != full_name: # filtering for name + continue + names[full_name] = Individual( + name=full_name, title=full_name, # TODO depracate one of (name, title) + compact=compact, + friend=True, + ) + return names + + @property + def names(self): + return self.df.name diff --git a/Group.py b/miner/Group.py similarity index 92% rename from Group.py rename to miner/Group.py index 2152d77..94e1ed9 100644 --- a/Group.py +++ b/miner/Group.py @@ -1,4 +1,4 @@ -# TODO LATER groups should be searched by looking into jsons unfortunately :( +# NOTE groups should be searched by looking into jsons unfortunately :( # because of directory says others # maybe we dont use groups right away? diff --git a/miner/Individual.py b/miner/Individual.py new file mode 100644 index 0000000..84d63b5 --- /dev/null +++ b/miner/Individual.py @@ -0,0 +1,63 @@ +class Individual: + def __init__(self, name=None, title=None, compact=None, messages=None, friend=None, messages_dir=None, + media_dir=None, + member_of=None): + self._name = name + self._title = title + self._compact_name = compact + self._messages = messages + self._friend = friend + self._messages_dir = messages_dir + self._media_dir = media_dir + self._member_of = member_of + + def __repr__(self): + return f'{self.name}, messages: {self.messages}' + + def __add__(self, other): + return Individual( + name=self.title if self.title else other.title, + title=self.title if self.title else other.title, # TODO depracate one of (name, title) + friend=self.friend if self.friend else other.friend, + compact=self.compact_name if self.compact_name else other.compact_name, + messages=self.messages if len(self.messages) else other.messages, + messages_dir=self.messages_dir if self.messages_dir else other.messages_dir, + media_dir=self.media_dir if self.media_dir else other.media_dir, + member_of=self.member_of if self.member_of else other.member_of + ) + + @property + def name(self): + return self._name + + @property + def title(self): + return self._title + + @property + def messages(self): + return self._messages + + @messages.setter + def messages(self, df): + self._messages = df + + @property + def friend(self): + return self._friend + + @property + def media_dir(self): + return self._media_dir + + @property + def messages_dir(self): + return self._messages_dir + + @property + def compact_name(self): + return self._compact_name + + @property + def member_of(self): + return self._member_of diff --git a/Me.py b/miner/Me.py similarity index 78% rename from Me.py rename to miner/Me.py index 3293bf7..377c3a9 100644 --- a/Me.py +++ b/miner/Me.py @@ -1,4 +1,4 @@ -from FacebookData import FacebookData +from miner.FacebookData import FacebookData class Me(FacebookData): diff --git a/MessagingAnalyzer.py b/miner/MessagingAnalyzer.py similarity index 93% rename from MessagingAnalyzer.py rename to miner/MessagingAnalyzer.py index 0619505..8e8ddd4 100644 --- a/MessagingAnalyzer.py +++ b/miner/MessagingAnalyzer.py @@ -1,19 +1,13 @@ -from utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals +from miner.utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals from datetime import datetime, date, timedelta import pandas as pd -from ConversationAnalyzer import ConversationAnalyzer +from miner.ConversationAnalyzer import ConversationAnalyzer class MessagingAnalyzer: - def __init__(self, names, people): - # TODO input people only. class will know what to do - self.names = names - self.people = people - - def time_series_analysis_for_all(self, subject=None, **kwargs): - time_series = generate_date_series(**kwargs) - stacked_df = self.stack_dfs(self.people) - interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) + def __init__(self, people): + self.names = people.names + self.people = people.data def get_stats(self, df, subject='all', start=None, end=None, period=None): # TODO LATER @@ -97,14 +91,13 @@ def most_used_words_by_partners(self, **kwargs): # 5. Number of messages sent/got on busiest period (by year/month/day/hour) def days_when_most_messages_sent(self): - # TODO LATER hard algorithmic problem pass def days_when_most_messages_received(self): pass def hours_when_most_messages_sent(self): - # TODO LATER + # TODO LATER hard algorithmic problem # is this referring to the absolute hour most messages sent?? # like: 2014.07.25. 15h-16h # OR @@ -119,7 +112,11 @@ def hours_when_most_messages_received(self): pass # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' - # TODO + def time_series_analysis_for_all(self, subject=None, **kwargs): + time_series = generate_date_series(**kwargs) + stacked_df = self.stack_dfs(self.people) + interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) + # TODO finsh this for time series for all @staticmethod def stack_dfs(people): diff --git a/miner/People.py b/miner/People.py new file mode 100644 index 0000000..707c6f5 --- /dev/null +++ b/miner/People.py @@ -0,0 +1,52 @@ +import time + +from miner.Conversations import Conversations +from miner.Friends import Friends + +# from Me import Me + +DATA_PATH = '/home/levente/projects/facebook-data-miner/data' + + +class People: + def __init__(self, path=None, name=None): + self.data_path = path if path else DATA_PATH + self._groups = [] + self._data = self.get_people(name=name) + self._names = self.data.keys() + + @property + def data(self): + return self._data + + @property + def names(self): + return self._names + + @property + def groups(self): + return self._groups + + def get_people(self, name=None): + start = time.time() + friend = Friends(self.data_path + '/friends/friends.json') + friends = friend.get_people(name=name) + print('friends: ', time.time() - start) + + # TODO LATER too slow -> store in file + start = time.time() + conversations = Conversations(self.data_path) + individuals = conversations.get_people_from_private_messages() + + print('convos: ', time.time() - start) + + return self.unify_people(friends, individuals) + + @staticmethod + def unify_people(friends, convo_partners): + for person, friend in friends.items(): + if not convo_partners.get(person): + convo_partners[person] = friend + else: + convo_partners[person] = convo_partners.get(person) + friend + return convo_partners diff --git a/Visualizer.py b/miner/Visualizer.py similarity index 88% rename from Visualizer.py rename to miner/Visualizer.py index 052ecb3..440c3e7 100644 --- a/Visualizer.py +++ b/miner/Visualizer.py @@ -1,8 +1,8 @@ import matplotlib.pyplot as plt import seaborn as sns import pandas as pd -from People import People -from ConversationAnalyzer import ConversationAnalyzer +from miner.People import People +from miner.ConversationAnalyzer import ConversationAnalyzer # plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120}) @@ -22,7 +22,7 @@ def plot_convos(self, names): @staticmethod def set_up_data(people, name, period='y'): - analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages) + analyzer = ConversationAnalyzer(name, people.data.get(name).messages) interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period=period) return analyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count') diff --git a/tests/TestMessages.py b/miner/__init__.py similarity index 100% rename from tests/TestMessages.py rename to miner/__init__.py diff --git a/utils.py b/miner/utils.py similarity index 82% rename from utils.py rename to miner/utils.py index 2a48624..c80001b 100644 --- a/utils.py +++ b/miner/utils.py @@ -1,11 +1,31 @@ +import os import json import pandas as pd import dateparser from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta +MESSAGE_SUBPATH = 'messages/inbox' +MEDIA_DIRS = ['photos', 'gifs', 'files', 'videos', 'audio'] MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] +DELTA_MAP = { + 'y': relativedelta(years=+1), + 'm': relativedelta(months=+1), + 'd': timedelta(days=1), + 'h': timedelta(hours=1) +} +ACCENTS_MAP = { + "á": "a", + "é": "e", + "í": "i", + "ó": "o", + "ö": "o", + "ő": "o", + "ú": "u", + "ü": "u", + "ű": "u", +} def read_json(file): @@ -22,28 +42,6 @@ def order_list_of_dicts(lst, key='timestamp_ms'): return sorted(lst, key=lambda k: k[key]) -accents_map = { - "á": "a", - "é": "e", - "í": "i", - "ó": "o", - "ö": "o", - "ő": "o", - "ú": "u", - "ü": "u", - "ű": "u", - # "Á": "A", - # "É": "E", - # "Í": "I", - # "Ó": "O", - # "Ö": "O", - # "Ő": "O", - # "Ú": "U", - # "Ü": "U", - # "Ű": "U", -} - - # @@ -110,14 +108,6 @@ def wrapper(*args, **kwargs): return wrapper -DELTA_MAP = { - 'y': relativedelta(years=+1), - 'm': relativedelta(months=+1), - 'd': timedelta(days=1), - 'h': timedelta(hours=1) -} - - def period_checker(func): def wrapper(*args, **kwargs): if kwargs.get('start') is not None and kwargs.get('end') is not None: @@ -134,7 +124,7 @@ def wrapper(*args, **kwargs): def generate_date_series(start=None, end=None, period=None): if period is None or DELTA_MAP.get(period) is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') - start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO change this to date when user joined FB + start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO LATER change this to date when user joined FB end = end or datetime.now() dates = [] @@ -147,14 +137,12 @@ def generate_date_series(start=None, end=None, period=None): def get_stats_for_intervals(func, df, time_series, subject='all'): data = {} - for offset, series in time_series.items(): - data[offset] = {} - for i in range(len(series) - 1): # only looping len - 1 times - start = series[i] - # TODO LATER will we miss the last entry? I dont think so (99%), but check and correct hand in hand with the timeseries bug - # IT DOES NOT! HOWEVER test it with new data injected/modified at runtime <- this is hard - end = series[i + 1] - data[offset][start] = func(df, subject=subject, start=start, end=end) + for i in range(len(time_series) - 1): # only looping len - 1 times + start = time_series[i] + # TODO test it with new data injected/modified at runtime <- this is hard + # what is this about actually? + end = time_series[i + 1] + data[start] = func(df, subject=subject, start=start, end=end) return data @@ -186,3 +174,27 @@ def decode_text(obj): return {key: decode_text(item) for key, item in obj.items()} return obj + + +def lower_names(col): + return col.str.lower() + + +def replace_accents(text): + for char in ACCENTS_MAP.keys(): + if char in text: + text = text.replace(char, ACCENTS_MAP[char]) + return text.replace(' ', '') + + +def without_accent_and_whitespace(col): + return col.apply(replace_accents) + + +def walk_directory_and_search(path, extension): + paths = [] + for root, dirs, files in os.walk(path): + for file_name in files: + if file_name.endswith(extension): + paths.append(os.path.join(root, file_name)) + return paths diff --git a/tests/TestPeople.py b/tests/TestPeople.py deleted file mode 100644 index 61295d8..0000000 --- a/tests/TestPeople.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest - - - -@pytest.fixture() -def people_names(): - return ['John Doe', 'Donald Duck', 'Szett Droxler', 'Foo Bar', 'Tőke Hal', 'Dér Dénes', 'Teflon Musk', 'Daisy Duck', - 'Guy Fawkes', 'Benedek Elek'] - - -def test_specific_people_has_or_has_not_got_messages(people): - # TODO LATER parametrize - import pandas as pd - assert isinstance(people.data.get('Benedek Elek').get('messages'), pd.DataFrame) - assert isinstance(people.data.get('Teflon Musk').get('messages'), pd.DataFrame) - assert isinstance(people.data.get('Tőke Hal').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('John Doe').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Szett Droxler').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Daisy Duck').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Guy Fawkes').get('messages'), pd.DataFrame) - - -def test_people_name(people, people_names): - people_without_groups = [p for p in people.data.keys() if not p.startswith('group')] - assert sorted(people_names) == sorted(people_without_groups) - - -def test_some_convos_are_with_friends(people): - assert people.data.get('Teflon Musk').get('friend') - assert not people.data.get('Benedek Elek').get('friend') - - -def test_specific_people_has_or_has_not_got_media(people): - assert people.data.get('Teflon Musk').get('media_dir') - -#TODO LATER test individuals too \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 430e923..08e382b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,12 @@ import pytest -from People import People +from miner.People import People TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' @pytest.fixture(scope='session') -def people(): - p = People(path=TEST_DATA_PATH) - return p +def get_people(): + def _get_people(name=None): + return People(path=TEST_DATA_PATH, name=name) + return _get_people + diff --git a/tests/TestConversationAnalyzer.py b/tests/test_ConversationAnalyzer.py similarity index 94% rename from tests/TestConversationAnalyzer.py rename to tests/test_ConversationAnalyzer.py index 015ac19..f7d679f 100644 --- a/tests/TestConversationAnalyzer.py +++ b/tests/test_ConversationAnalyzer.py @@ -1,22 +1,16 @@ import pytest -from ConversationAnalyzer import ConversationAnalyzer -from People import People -from utils import dt +from miner.ConversationAnalyzer import ConversationAnalyzer +from miner.People import People +from miner.utils import dt TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' -# @pytest.mark.parametrize("test_input,expected", [("3+5", 8), ("2+4", 6), ("6*9", 42)]) -# def test_eval(test_input, expected): -# assert eval(test_input) == expected - -# get\(\'.*\'\)\. - - @pytest.fixture(scope='session') -def person(people): +def person(get_people): def _person(name): - return people.individuals[name] + people = get_people(name) + return people.data[name] return _person @@ -224,11 +218,12 @@ def test_stats_teflon_musk_all_2014_12(statistics): assert stats.char_count == 0 # assert stats.most_used_chars == 0 -class TestConversationAnalyzer: # Foo Bar + +class TestConversationAnalyzer: # Foo Bar pass def test_time_series_analysis_for_user(analyze): analyzer = analyze('Teflon Musk') - analyzer.get_time_series_data(subject='all') + analyzer.get_time_series_data(subject='all', period='y') assert 1 diff --git a/tests/TestConversations.py b/tests/test_Conversations.py similarity index 79% rename from tests/TestConversations.py rename to tests/test_Conversations.py index e198dc9..d066361 100644 --- a/tests/TestConversations.py +++ b/tests/test_Conversations.py @@ -1,6 +1,7 @@ import pandas as pd import pytest -from Conversations import Conversations +from miner.Conversations import Conversations +from miner import utils import os TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' @@ -8,12 +9,12 @@ @pytest.fixture() def convos(): convo = Conversations(f'{TEST_DATA_PATH}') - return convo.get_people() + return convo.get_people_from_private_messages() def test_get_all_people_from_convo(convos): people = [] - + # TODO make this work for convo in convos.keys(): if convo.startswith('group'): people += [p for p in convos[convo].get('participants')] @@ -28,19 +29,19 @@ def test_get_all_people_from_convo(convos): def test_all_convos_have_dir(convos): - assert all([data.get('messages_dir') for data in convos.values()]) + assert all([data.messages_dir for data in convos.values()]) def test_all_convos_have_messages_df(convos): - assert all([isinstance(data.get('messages'), pd.DataFrame) for data in convos.values()]) + assert all([isinstance(data.messages, pd.DataFrame) for data in convos.values()]) def test_some_convos_as_media_dir(convos): - assert convos.get('Teflon Musk').get('media_dir') - assert not convos.get('Benedek Elek').get('media_dir') + assert convos.get('Teflon Musk').media_dir + assert not convos.get('Benedek Elek').media_dir def test_convo_media_has_one_folder_of_possibles(convos): - listed_dir = os.listdir(f"{TEST_DATA_PATH}/{convos.get('Teflon Musk').get('media_dir')}") + listed_dir = os.listdir(f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{convos.get('Teflon Musk').media_dir}") assert 'files' in listed_dir assert 'photos' in listed_dir assert 'audio' not in listed_dir diff --git a/tests/TestFriends.py b/tests/test_Friends.py similarity index 91% rename from tests/TestFriends.py rename to tests/test_Friends.py index f336609..652b671 100644 --- a/tests/TestFriends.py +++ b/tests/test_Friends.py @@ -1,6 +1,6 @@ import pytest -from Friends import Friends +from miner.Friends import Friends TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' @@ -31,7 +31,7 @@ def test_get_peoples_names_from_friends(friends, expected_friends): def test_get_peoples_compact_name_from_friends(friends, expected_friends): expected_compact_names = [value.get('compact_name') for value in expected_friends.values()] - assert all([p.get('compact_name') in expected_compact_names for p in friends.values()]) + assert all([p.compact_name in expected_compact_names for p in friends.values()]) diff --git a/tests/test_Messages.py b/tests/test_Messages.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/TestMessagingAnalyzer.py b/tests/test_MessagingAnalyzer.py similarity index 98% rename from tests/TestMessagingAnalyzer.py rename to tests/test_MessagingAnalyzer.py index b803693..ec03497 100644 --- a/tests/TestMessagingAnalyzer.py +++ b/tests/test_MessagingAnalyzer.py @@ -1,10 +1,11 @@ import pytest -from MessagingAnalyzer import MessagingAnalyzer -from utils import dt +from miner.MessagingAnalyzer import MessagingAnalyzer +from miner.utils import dt @pytest.fixture(scope='session') -def analyzer(people): - return MessagingAnalyzer(people.names, people.individuals) +def analyzer(get_people): + people = get_people() + return MessagingAnalyzer(people) def test_total_number_of_messages(analyzer): diff --git a/tests/test_People.py b/tests/test_People.py new file mode 100644 index 0000000..e9f270e --- /dev/null +++ b/tests/test_People.py @@ -0,0 +1,39 @@ +import pytest + + + +@pytest.fixture() +def people_names(): + return ['John Doe', 'Donald Duck', 'Szett Droxler', 'Foo Bar', 'Tőke Hal', 'Dér Dénes', 'Teflon Musk', 'Daisy Duck', + 'Guy Fawkes', 'Benedek Elek'] + +@pytest.fixture +def people(get_people): + return get_people() + +def test_specific_people_has_or_has_not_got_messages(people): + # TODO LATER parametrize + import pandas as pd + assert isinstance(people.data.get('Benedek Elek').messages, pd.DataFrame) + assert isinstance(people.data.get('Teflon Musk').messages, pd.DataFrame) + assert isinstance(people.data.get('Tőke Hal').messages, pd.DataFrame) + assert not isinstance(people.data.get('John Doe').messages, pd.DataFrame) + assert not isinstance(people.data.get('Szett Droxler').messages, pd.DataFrame) + assert not isinstance(people.data.get('Daisy Duck').messages, pd.DataFrame) + assert not isinstance(people.data.get('Guy Fawkes').messages, pd.DataFrame) + + +def test_people_name(people, people_names): + people_without_groups = [p for p in people.data.keys() if not p.startswith('group')] + assert sorted(people_names) == sorted(people_without_groups) + + +def test_some_convos_are_with_friends(people): + assert people.data.get('Teflon Musk').friend + assert not people.data.get('Benedek Elek').friend + + +def test_specific_people_has_or_has_not_got_media(people): + assert people.data.get('Teflon Musk').media_dir + +#TODO LATER test individuals too \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 579569c..4d11263 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ import unittest -from utils import * +from miner.utils import * from pathlib import Path import reusables from reusables.cli import * From b2f725eda6da5500aa2effcd351494a401e79347 Mon Sep 17 00:00:00 2001 From: Levente Csoke Date: Sat, 15 Aug 2020 01:01:19 +0200 Subject: [PATCH 2/3] refactored MsgA and ConvoA into Analyzer; added lot of functionalities for gathering data for plotting --- .gitignore | 21 +- README.md | 8 +- miner/Analyzer.py | 133 ++++++++++++ miner/App.py | 26 +-- miner/ConversationAnalyzer.py | 146 ------------- miner/ConversationStats.py | 89 ++++++++ miner/Conversations.py | 155 ++++++-------- miner/FacebookData.py | 2 - miner/Friends.py | 4 +- miner/Individual.py | 10 +- miner/Me.py | 2 +- miner/Messages.py | 64 ++++++ miner/MessagingAnalyzer.py | 127 ----------- miner/People.py | 11 +- miner/requirements.txt | 9 + miner/utils.py | 68 ++++-- requirements.txt | 5 + tests/test_ConversationAnalyzer.py | 33 ++- tests/test_Conversations.py | 92 ++++---- tests/test_MessagingAnalyzer.py | 333 +++++++++++++++-------------- tests/test_utils.py | 9 +- 21 files changed, 670 insertions(+), 677 deletions(-) create mode 100644 miner/Analyzer.py delete mode 100644 miner/ConversationAnalyzer.py create mode 100644 miner/ConversationStats.py create mode 100644 miner/Messages.py delete mode 100644 miner/MessagingAnalyzer.py create mode 100644 miner/requirements.txt create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 5b8858a..4755795 100644 --- a/.gitignore +++ b/.gitignore @@ -228,24 +228,19 @@ dmypy.json # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebook,pycharm,visualstudiocode +# ignoring data folder +data -# ignoring data -data +# ignoring jupyter notebook +tests/playground.py -# ignoring todo +# ignoring various files created during development +plots +*.png todo.md - - -# ignoring trash file trash.py - - -# ignoring jupyter notebook explore.ipynb - - -# ignoring jupyter notebook -tests/playground.py +tests/test_data/messages/inbox/private_messages.json tests/.pytest_cache .pytest_cache \ No newline at end of file diff --git a/README.md b/README.md index 943a123..c4b3b6a 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,10 @@ More info soon... ## Contribution Help is more than welcome. If somebody feel the urge to contribute, I would share my plans with them. -Ideas are welcome too. Feel free to open a new issue. \ No newline at end of file +Ideas are welcome too. Feel free to open a new issue. + + +For running VIsualizer CLI: +```shell script +export PYTHONPATH="$PWD" +``` diff --git a/miner/Analyzer.py b/miner/Analyzer.py new file mode 100644 index 0000000..b0e61bc --- /dev/null +++ b/miner/Analyzer.py @@ -0,0 +1,133 @@ +from miner.ConversationStats import ConversationStats +from miner import utils +import pandas as pd + + +class Analyzer: + # TODO do we need to override __subclasscheck__ ? + + # def __new__(cls, name, messages, *args, **kwargs): + # if messages is None: # This deals with the case if no messages + # return None + # return super(Analyzer, cls).__new__(cls, *args, **kwargs) + + def __init__(self, people): + self.people = people + self.people_data = people.data + self.names = people.names + self.multi = len(self.people_data) > 1 + + if self.multi: + self.df = self.stack_dfs() + else: + # TODO solve this hand in hand with the __new__ method. too ugly + self.df = self.people_data.get(list(self.names)[0]).messages + + def get_stats_for_intervals(self, time_series, subject='all'): + data = {} + for i in range(len(time_series) - 1): # only looping len - 1 times + start = time_series[i] + end = time_series[i + 1] + data[start] = self.get_stats(self.df, subject=subject, start=start, end=end) + return data + + def get_stats(self, df=None, subject='all', start=None, end=None, period=None): + df = self.df if df is None else df + df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) + stats = ConversationStats(df) + return stats + + @staticmethod + def get_plottable_time_series_data(interval_stats, statistic): + for k, v in interval_stats.items(): + if isinstance(v, ConversationStats): + interval_stats[k] = getattr(v, statistic) + return interval_stats + + @property + def stats(self): + return self.get_stats() + + def __str__(self): + if self.multi: + return self.names + else: + return f'{self.names[0]}: {list(self.df.index)}' + + def stack_dfs(self): + dfs = [] + for data in self.people_data.values(): + if data.messages is not None: + dfs.append(data.messages) + return pd.concat(dfs).sort_index() + + # 1. Total count of messages/words/characters (also by year/month/day/hour) + # 2. Total count of messages/words/characters sent (also by year/month/day/hour) + # 3. Total count of messages/words/characters received (also by year/month) + def get_count(self, attribute, subject='all', start=None, end=None, period=None): + stats = self.get_stats(subject=subject, start=start, end=end, period=period) + return getattr(stats, attribute) + + ################# + + # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) + def most_used_messages_(self, **kwargs): + """ + >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) + >>> s2 = pd.Series([3, 2, 1, 1]) + >>> s1_vc = s1.value_counts() + >>> s2_vc = s2.value_counts() + TODO LATER most used is already a problem: + - because its a series of all the unique messages/words ever used in a convo + - it contains strings like ':d', ':p' and 'xd' + - from all the convos the result of value_counts has to be cleared + and has to be truncated (that is not use the 200th most used word, only top10 let's say) + - then these series has to be merged in a way that the same string's counts are added up + - what about typos????! + """ + pass + + # 5. Number of messages sent/got on busiest period (by year/month/day/hour) + def stat_per_period(self, period, attribute, **kwargs): + interval_stats = self.get_time_series_data(period, **kwargs) + # TODO attribute is one of (msg, word, char) + time_series_data = self.get_plottable_time_series_data(interval_stats, statistic=attribute) + return utils.count_stat_for_period(time_series_data, period) + + # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' + def get_time_series_data(self, period, subject='all', **kwargs): + time_series = utils.generate_date_series(period, **kwargs) + return self.get_stats_for_intervals(self.df, time_series, subject=subject) + + # # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got + def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None, + period=None): + # TODO almost the same function as get_count + count_dict = {} + for name in self.names: + # analyzer = Analyzer({name: self.people.get(name)}) # this has to be a people instance?! OR? + # analyzer = Analyzer(People(self.people.data_path, name=name)) # this has to be a people instance?! OR? + df = self.df[self.df.partner == name] + stats = self.get_stats(df=df, subject=subject, start=start, end=end, period=period) + if stats is not None: + count_dict = utils.fill_dict(count_dict, name, getattr(stats, attribute)) + + count_dict = {key: value for key, value in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)} + return count_dict + + @staticmethod + @utils.subject_checker + @utils.date_checker + @utils.period_checker + def filter_by_input(df, subject='all', start=None, end=None, period=None): + if subject == 'me': + df = df[df.sender_name == 'Levente Csőke'] + elif subject == 'partner': + df = df[df.sender_name != 'Levente Csőke'] + if start and end: + df = df.loc[start:end] + elif start and not end: + df = df.loc[start:start + period] + elif not start and end: + df = df.loc[end - period:end] + return df diff --git a/miner/App.py b/miner/App.py index c2560bd..4989e8b 100644 --- a/miner/App.py +++ b/miner/App.py @@ -1,5 +1,5 @@ -from miner.ConversationAnalyzer import ConversationAnalyzer -from miner.MessagingAnalyzer import MessagingAnalyzer +from miner.Analyzer import Analyzer + from miner.People import People DATA_PATH = '/home/levente/projects/facebook-data-miner/data' @@ -13,26 +13,8 @@ def __init__(self): def analyze_messages(): p = People(path=DATA_PATH) - stats = {} - - for name, person in p.data.items(): - if person.messages is None: - stats[person.name] = None - continue - analyzer = ConversationAnalyzer(person.name, person.messages) - stats[person.name] = analyzer.stats - # if stats[person.name].get('message_count').get('me') > 5000: - # top[person.name] = stats[person.name] - print() - - # print('LEN: ', len(top.keys())) - # top_all = {name: data.get('message_count').get('all') for name, data in top.items()} - # analyzer.visualize_stats(top) - - @staticmethod - def analyze_messaging(): - people = People(path=DATA_PATH) - msg_analyzer = MessagingAnalyzer(people) + analyzer = Analyzer(p) + rank = analyzer.get_ranking_of_friends_by_messages(attribute='char_count') if __name__ == '__main__': diff --git a/miner/ConversationAnalyzer.py b/miner/ConversationAnalyzer.py deleted file mode 100644 index a5928bf..0000000 --- a/miner/ConversationAnalyzer.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -from miner.utils import date_checker, period_checker, subject_checker, generate_date_series, get_stats_for_intervals - - -class ConversationAnalyzer: - def __new__(cls, name, messages, *args, **kwargs): - if messages is None: # This deals with the case if no messages - return None - return super(ConversationAnalyzer, cls).__new__(cls, *args, **kwargs) - - def __init__(self, name, messages): - self.name = name - self.df = messages - - def __str__(self): - return f'{self.name}: {list(self.df.index)}' - - @property - def stats(self): - return self.get_stats(self.df) - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) - stats = ConversationStats(df) - return stats - - def get_time_series_data(self, subject='all', **kwargs): - time_series = generate_date_series(**kwargs) - return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) - - @staticmethod - def get_plottable_time_series_data(interval_stats, statistic): - for k, v in interval_stats.items(): - if isinstance(v, ConversationStats): - interval_stats[k] = getattr(v, statistic) - return interval_stats - - @staticmethod - @subject_checker - @date_checker - @period_checker - def filter_by_input(df, subject='all', start=None, end=None, period=None): - if subject == 'me': - df = df[df.sender_name == 'Levente Csőke'] - elif subject == 'partner': - df = df[df.sender_name != 'Levente Csőke'] - if start and end: - df = df.loc[start:end] - elif start and not end: - df = df.loc[start:start + period] - elif not start and end: - df = df.loc[end - period:end] - return df - - -class ConversationStats: - """ - Statistics of conversation with one person. - """ - - def __init__(self, df): - self.df = df - - def __repr__(self): - return f'{self.msg_count}' - - @property - def messages(self): - return self.df.content.dropna() - - @property - def words(self): - return self.get_words() - - # 1. - @property - def msg_count(self): - return len(self.df) - - # 2. - @property - def unique_msg_count(self): - return len(self.messages.unique()) - - # 3. - @property - def most_used_msgs(self): - # TODO LATER first few (1-10) messages - return self.messages.value_counts() - - # 4. - @property - def msg_frequency(self): - # NOTE this has been most likely depracated OR? - pass - - # 5. - @property - def word_count(self): - return len(self.words) - - # 6. - @property - def unique_word_count(self): - return len(set(self.words)) - - # 7. - @property - def most_used_words(self): - s = pd.Series(self.words) - return s.value_counts() - - # 8. - @property - def word_frequency(self): - pass - - # 9. - @property - def char_count(self): - char_count = 0 - for word in self.words: - char_count += len(word) - return char_count - - # 10. - @property - def rate_of_media_messages(self): - """ - TODO LATER - search for media messages all 5 of them - rate is only the second or third abstraction - """ - pass - - def get_words(self): - token_list = self.messages.str.lower().str.split() - words = [] - for tokens in token_list: - # print(tokens) - if not isinstance(tokens, list): - print('WARNING! Not a list!') - continue # TODO ??? check this - for token in tokens: - words.append(token) - return words diff --git a/miner/ConversationStats.py b/miner/ConversationStats.py new file mode 100644 index 0000000..040bd17 --- /dev/null +++ b/miner/ConversationStats.py @@ -0,0 +1,89 @@ + +class ConversationStats: + """ + Statistics of conversation with one person. + """ + + def __init__(self, df): + self.df = df + + def __repr__(self): + return f'{self.msg_count}' + + @property + def messages(self): + return self.df.content.dropna() + + @property + def words(self): + return self.get_words() + + # 1. + @property + def msg_count(self): + return len(self.df) + + # 2. + @property + def unique_msg_count(self): + return len(self.messages.unique()) + + # 3. + @property + def most_used_msgs(self): + return self.messages.value_counts() + + # 4. + @property + def msg_frequency(self): + # NOTE this has been most likely depracated OR? + pass + + # 5. + @property + def word_count(self): + return len(self.words) + + # 6. + @property + def unique_word_count(self): + return len(set(self.words)) + + # 7. + @property + def most_used_words(self): + return pd.Series(self.words).value_counts() + + # 8. + @property + def word_frequency(self): + pass + + # 9. + @property + def char_count(self): + char_count = 0 + for word in self.words: + char_count += len(word) + return char_count + + # 10. + @property + def rate_of_media_messages(self): + """ + TODO LATER + search for media messages all 5 of them + rate is only the second or third abstraction + """ + pass + + def get_words(self): + token_list = self.messages.str.lower().str.split() + words = [] + for tokens in token_list: + if not isinstance(tokens, list): + print('WARNING! Not a list!') + continue + for token in tokens: + words.append(token) + return words diff --git a/miner/Conversations.py b/miner/Conversations.py index 4f61ad9..7a373d4 100644 --- a/miner/Conversations.py +++ b/miner/Conversations.py @@ -1,64 +1,93 @@ +import pandas as pd import os -from miner.Group import Group + + +from miner.Messages import Messages from miner.Individual import Individual -from miner.FacebookData import FacebookData -import pandas as pd + from miner import utils -from datetime import datetime class Conversations: def __init__(self, data_path): - self.indie_convo_paths = [] - self.group_convo_paths = [] - self.deleted_user_convo_paths = [] + self.private_convo_paths = {} + self.group_convo_paths = {} # TODO fill this as well + self.deleted_user_convo_paths = [] # NOTE these are collected but not yet used self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}' self.order_paths() def order_paths(self): - json_paths = utils.walk_directory_and_search(self.data_path, '.json') + paths_map = f'{self.data_path}/private_messages.json' + if os.path.isfile(paths_map): + self.read_paths(paths_map) + return + json_paths = utils.walk_directory_and_search(self.data_path, '.json', contains_string='message_') self.differentiate_paths(json_paths) + self.register_paths() def differentiate_paths(self, jsons): for file in jsons: msg = Messages(file) if msg.title == 'Facebook User': - self.deleted_user_convo_paths.append(file) + self.deleted_user_convo_paths.append(msg.messages_dir) elif msg.ttype == 'RegularGroup': - self.group_convo_paths.append(file) + self.map_group_convo_files(msg, file) elif msg.ttype == 'Regular': - self.indie_convo_paths.append(file) + # self.private_convo_paths[msg.title] = msg.messages_dir + self.map_private_convo_files(msg, file) else: raise ValueError('Should not happen!') + def register_paths(self): + utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json') + + def read_paths(self, file): + self.private_convo_paths = utils.read_json(file) + print() + + def map_private_convo_files(self, msg, file): + name = msg.title + if self.private_convo_paths.get(name): + self.private_convo_paths[name].append(file) + else: + self.private_convo_paths[name] = [file] + + def map_group_convo_files(self, msg, file): + for participant in msg.participants: + if self.group_convo_paths.get(file): + self.group_convo_paths[file].append(participant) + else: + self.group_convo_paths[file] = [participant] + def get_people_from_private_messages(self, name=None, membership=True): name_data_map = {} - paths = self.indie_convo_paths - if name is not None: - paths = self.filter_by_name(name) - for file in paths: - messages = Messages(file) - name = messages.title - if name_data_map.get(name) is not None: - dfs = [name_data_map[name].messages, messages.df] - name_data_map[name].messages = pd.concat(dfs).sort_index() - else: - # TODO we may also want to get group messages where name is present - name_data_map[name] = self.create_individual(messages, membership=membership) + convo_path_map = self.filter_by_name(name) if name is not None else self.private_convo_paths.values() + for paths in convo_path_map: + for file in paths: + messages = Messages(file) + name = messages.title + if name_data_map.get(name) is not None: + dfs = [name_data_map[name].messages, messages.df] + name_data_map[name].messages = pd.concat(dfs).sort_index() + else: + name_data_map[name] = self.create_individual(messages, membership=membership) return name_data_map def filter_by_name(self, name): filtered_paths = [] - compact_name = None if name is None else utils.replace_accents(name.lower()) - for path in self.indie_convo_paths: - if compact_name in os.path.basename(os.path.dirname(os.path.normpath(path))): - filtered_paths.append(path) + names = [] + if isinstance(name, str): + names = [name] + elif isinstance(name, list): + names = name + for name in names: + filtered_paths.append(self.private_convo_paths.get(name)) return filtered_paths def create_individual(self, messages, membership=None): return Individual( - name=messages.title, title=messages.title, # TODO depracate one of (name, title) + name=messages.title, compact=messages.compact_names, messages=messages.df, messages_dir=messages.messages_dir, @@ -66,74 +95,12 @@ def create_individual(self, messages, membership=None): member_of=self.group_membership(messages.title) if membership else None, ) - @staticmethod - def fill_data_map(message): - return { - 'title': message.title, - 'compact_name': message.compact_names, - # 'participants': msg.participants + ['Levente Csőke'], - 'participants': message.participants, - 'messages': message.df, - 'friend': None, - 'messages_dir': message.messages_dir, - 'media_dir': message.media_dir - } - @staticmethod def group_membership(name): return None - @staticmethod - def json_is_a_group_msg(file): - msg = Messages(file) - return msg.ttype == 'RegularGroup' - - -class Messages(FacebookData): - def __init__(self, json_path): - super().__init__(json_path) - self.to_df('messages') - self.set_date_as_index() - - @property - def names(self): - return pd.DataFrame(self.participants)[0] - - @property - def participants(self): - participants = self.decoded.get('participants') - # TODO I should be IN - # but this breaks stuff at TestMessagingAnalyzer - return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] - # return [p.get('name') for p in participants if p.get('name')] - - @property - def title(self): - return self.decoded.get('title') - - @property - def ttype(self): - return self.decoded.get('thread_type') - - @property - def messages_dir(self): - thread_path = self.decoded.get('thread_path') - if not thread_path.startswith('inbox/'): - raise ValueError('Field `thread_path` should start with `inbox/`.') - return thread_path.split('inbox/')[1] - - @property - def media_dir(self): - for media in utils.MEDIA_DIRS: - if media in self._df.columns: - media_in_msg = list(self._df[media][self._df[media].notnull()]) - uri = media_in_msg[0][0].get('uri') - return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] - - def set_date_as_index(self): - date_series = self._df.timestamp_ms.apply(self.ts_to_date) - self._df = self._df.set_index(date_series).iloc[::-1] + def get_people_from_group_messages(self): + pass # TODO for v0.0.4 + + - @staticmethod - def ts_to_date(date): - return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/miner/FacebookData.py b/miner/FacebookData.py index babe74d..81b946f 100644 --- a/miner/FacebookData.py +++ b/miner/FacebookData.py @@ -21,8 +21,6 @@ def json(self): @property def compact_names(self): - # NOTE this is the place where we change pd/np to builtin - # do we have to do this? name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) # should be just fine return name_list[0] if len(name_list) == 1 else name_list diff --git a/miner/Friends.py b/miner/Friends.py index 7950155..98d995f 100644 --- a/miner/Friends.py +++ b/miner/Friends.py @@ -1,5 +1,3 @@ -import pandas as pd -import os from miner.FacebookData import FacebookData from miner.Individual import Individual @@ -16,7 +14,7 @@ def get_people(self, name=None): if name is not None and name != full_name: # filtering for name continue names[full_name] = Individual( - name=full_name, title=full_name, # TODO depracate one of (name, title) + name=full_name, compact=compact, friend=True, ) diff --git a/miner/Individual.py b/miner/Individual.py index 84d63b5..4518a5f 100644 --- a/miner/Individual.py +++ b/miner/Individual.py @@ -1,9 +1,8 @@ class Individual: - def __init__(self, name=None, title=None, compact=None, messages=None, friend=None, messages_dir=None, + def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, member_of=None): self._name = name - self._title = title self._compact_name = compact self._messages = messages self._friend = friend @@ -16,8 +15,7 @@ def __repr__(self): def __add__(self, other): return Individual( - name=self.title if self.title else other.title, - title=self.title if self.title else other.title, # TODO depracate one of (name, title) + name=self.name if self.name else other.name, friend=self.friend if self.friend else other.friend, compact=self.compact_name if self.compact_name else other.compact_name, messages=self.messages if len(self.messages) else other.messages, @@ -30,10 +28,6 @@ def __add__(self, other): def name(self): return self._name - @property - def title(self): - return self._title - @property def messages(self): return self._messages diff --git a/miner/Me.py b/miner/Me.py index 377c3a9..b10356d 100644 --- a/miner/Me.py +++ b/miner/Me.py @@ -7,4 +7,4 @@ def __init__(self, *args): @property def name(self): - return 'Levente Csőke' + return '' diff --git a/miner/Messages.py b/miner/Messages.py new file mode 100644 index 0000000..6fbc9d3 --- /dev/null +++ b/miner/Messages.py @@ -0,0 +1,64 @@ +from datetime import datetime +import pandas as pd +import os + +from miner.FacebookData import FacebookData +from miner import utils + + +class Messages(FacebookData): + def __init__(self, json_path): + super().__init__(json_path) + self.to_df('messages') + self.set_date_as_index() + self.add_partner_column() + + @property + def names(self): + # TODO ugly + try: + return pd.DataFrame(self.participants)[0] + except KeyError: + return pd.Series({0: 'Facebook User'}) + + @property + def participants(self): + participants = self.decoded.get('participants') + # TODO I should be IN + # but this breaks stuff at TestMessagingAnalyzer + return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] + # return [p.get('name') for p in participants if p.get('name')] + + @property + def title(self): + return self.decoded.get('title') + + @property + def ttype(self): + return self.decoded.get('thread_type') + + @property + def messages_dir(self): + thread_path = self.decoded.get('thread_path') + if not thread_path.startswith('inbox/'): + raise ValueError('Field `thread_path` should start with `inbox/`.') + return thread_path.split('inbox/')[1] + + @property + def media_dir(self): + for media in utils.MEDIA_DIRS: + if media in self._df.columns: + media_in_msg = list(self._df[media][self._df[media].notnull()]) + uri = media_in_msg[0][0].get('uri') + return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] + + def set_date_as_index(self): + date_series = self._df.timestamp_ms.apply(self.ts_to_date) + self._df = self._df.set_index(date_series).iloc[::-1] + + def add_partner_column(self): + self._df['partner'] = self.title + + @staticmethod + def ts_to_date(date): + return datetime.fromtimestamp(date / 1000) diff --git a/miner/MessagingAnalyzer.py b/miner/MessagingAnalyzer.py deleted file mode 100644 index 8e8ddd4..0000000 --- a/miner/MessagingAnalyzer.py +++ /dev/null @@ -1,127 +0,0 @@ -from miner.utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals -from datetime import datetime, date, timedelta -import pandas as pd -from miner.ConversationAnalyzer import ConversationAnalyzer - - -class MessagingAnalyzer: - def __init__(self, people): - self.names = people.names - self.people = people.data - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - # TODO LATER - # here you have to do something with it - pass - - def get_count(self, attribute, subject='all', start=None, end=None, period=None): - count = 0 - # we have a list of names we want to iterate over - for name in self.names: - stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period) - if stats is not None: - count += getattr(stats, attribute) - return count - - def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None): - messages = self.people.get(name).messages - analyzer = ConversationAnalyzer(name, messages) - if analyzer is None: - return None - return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period) - - def total_number_of_(self, attribute, subject='all', **kwargs): - return self.get_count(attribute=attribute, subject=subject, **kwargs) - - # 1. Ranking of friends by total count of messages/words/characters (also by year/month/day/hour) - def total_number_of_messages(self, **kwargs): - return self.total_number_of_(attribute='msg_count', **kwargs) - - def total_number_of_words(self, **kwargs): - return self.total_number_of_(attribute='word_count', **kwargs) - - def total_number_of_characters(self, **kwargs): - return self.total_number_of_(attribute='char_count', **kwargs) - - # 2. Ranking of friends who I sent the most messages/words/characters (also by year/month/day/hour) - def total_number_of_messages_sent(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='me', **kwargs) - - def total_number_of_words_sent(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='me', **kwargs) - - def total_number_of_characters_sent(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='me', **kwargs) - - # 3. Ranking of friends who sent the most messages/words/characters (also by year/month) - def total_number_of_messages_received(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='partner', **kwargs) - - def total_number_of_words_received(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='partner', **kwargs) - - def total_number_of_characters_received(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='partner', **kwargs) - - # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) - def most_used_messages_by_me(self, **kwargs): - """ - >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) - >>> s2 = pd.Series([3, 2, 1, 1]) - >>> s1_vc = s1.value_counts() - >>> s2_vc = s2.value_counts() - TODO LATER most used is already a problem: - - because its a series of all the unique messages/words ever used in a convo - - it contains strings like ':d', ':p' and 'xd' - - from all the convos the result of value_counts has to be cleared - and has to be truncated (that is not use the 200th most used word, only top10 let's say) - - then these series has to be merged in a way that the same string's counts are added up - - what about typos????! - """ - pass - - def most_used_messages_by_partners(self, **kwargs): - pass - - def most_used_words_by_me(self, **kwargs): - pass - - def most_used_words_by_partners(self, **kwargs): - pass - - # 5. Number of messages sent/got on busiest period (by year/month/day/hour) - def days_when_most_messages_sent(self): - pass - - def days_when_most_messages_received(self): - pass - - def hours_when_most_messages_sent(self): - # TODO LATER hard algorithmic problem - # is this referring to the absolute hour most messages sent?? - # like: 2014.07.25. 15h-16h - # OR - # the pattern of most messages sent between this and this hours - # like: 20h-21h - # ACTUALLY BOTH - # for years/months/days/hours - # BUT this comes from the time series analysis - pass - - def hours_when_most_messages_received(self): - pass - - # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' - def time_series_analysis_for_all(self, subject=None, **kwargs): - time_series = generate_date_series(**kwargs) - stacked_df = self.stack_dfs(self.people) - interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) - # TODO finsh this for time series for all - - @staticmethod - def stack_dfs(people): - dfs = [] - for data in people.values(): - if data.messages is not None: - dfs.append(data.messages) - return pd.concat(dfs).sort_index() diff --git a/miner/People.py b/miner/People.py index 707c6f5..2970390 100644 --- a/miner/People.py +++ b/miner/People.py @@ -3,8 +3,6 @@ from miner.Conversations import Conversations from miner.Friends import Friends -# from Me import Me - DATA_PATH = '/home/levente/projects/facebook-data-miner/data' @@ -21,7 +19,7 @@ def data(self): @property def names(self): - return self._names + return self._names #if len(self._names) > 1 else self._names[0] @property def groups(self): @@ -36,9 +34,10 @@ def get_people(self, name=None): # TODO LATER too slow -> store in file start = time.time() conversations = Conversations(self.data_path) - individuals = conversations.get_people_from_private_messages() - - print('convos: ', time.time() - start) + print('convos1: ', time.time() - start) + start = time.time() + individuals = conversations.get_people_from_private_messages(name=name) + print('convos2: ', time.time() - start) return self.unify_people(friends, individuals) diff --git a/miner/requirements.txt b/miner/requirements.txt new file mode 100644 index 0000000..1262ec9 --- /dev/null +++ b/miner/requirements.txt @@ -0,0 +1,9 @@ +numpy==1.18.1 +pandas==1.0.3 +dateparser==0.7.6 +seaborn==0.10.1 +matplotlib==3.2.1 +plotly==4.8.2 +miner==0.0.0 +Pillow==7.2.0 +python_dateutil==2.8.1 diff --git a/miner/utils.py b/miner/utils.py index c80001b..cfa7644 100644 --- a/miner/utils.py +++ b/miner/utils.py @@ -1,6 +1,5 @@ import os import json -import pandas as pd import dateparser from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta @@ -9,6 +8,13 @@ MEDIA_DIRS = ['photos', 'gifs', 'files', 'videos', 'audio'] MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] +WEEKDAYS = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] +PERIOD_MAP = { + 'y': None, + 'm': MONTHS, + 'd': WEEKDAYS, + 'h': None, +} DELTA_MAP = { 'y': relativedelta(years=+1), 'm': relativedelta(months=+1), @@ -34,17 +40,14 @@ def read_json(file): def dump_to_json(data=None, file=None): - with open(file, 'w') as f: - json.dump(data, f) + with open(file, 'w', encoding='utf8') as f: + json.dump(data, f, ensure_ascii=False) def order_list_of_dicts(lst, key='timestamp_ms'): return sorted(lst, key=lambda k: k[key]) -# - - def year_converter(func): """ Higher-order function that converts @year param passed to @func into numeric version. @@ -121,31 +124,22 @@ def wrapper(*args, **kwargs): return wrapper -def generate_date_series(start=None, end=None, period=None): +def generate_date_series(period, start=None, end=None): if period is None or DELTA_MAP.get(period) is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO LATER change this to date when user joined FB end = end or datetime.now() + # TODO THIS HAS A PROBLEM. msgs happened in 2020 getting assigned to 2019 because: 2019 + 1 year + start.month + start.day < now() + # TODO serious problem! dates = [] intermediate = start - while intermediate <= end: + while intermediate <= (end + DELTA_MAP.get(period)): # means that we want to have the end in it as well dates.append(intermediate) intermediate = intermediate + DELTA_MAP.get(period) return dates -def get_stats_for_intervals(func, df, time_series, subject='all'): - data = {} - for i in range(len(time_series) - 1): # only looping len - 1 times - start = time_series[i] - # TODO test it with new data injected/modified at runtime <- this is hard - # what is this about actually? - end = time_series[i + 1] - data[start] = func(df, subject=subject, start=start, end=end) - return data - - def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0): return datetime(year=year, month=month, day=day, hour=hour) @@ -191,10 +185,42 @@ def without_accent_and_whitespace(col): return col.apply(replace_accents) -def walk_directory_and_search(path, extension): +def walk_directory_and_search(path, extension, contains_string=None): paths = [] for root, dirs, files in os.walk(path): for file_name in files: if file_name.endswith(extension): - paths.append(os.path.join(root, file_name)) + if contains_string is not None and contains_string in file_name: + paths.append(os.path.join(root, file_name)) return paths + + +def fill_dict(dictionary, key, value): + if dictionary.get(key) is not None: + dictionary[key] += value + else: + dictionary[key] = value + return dictionary + + +def month_sorter(x): + return MONTHS.index(x[0]) + + +def count_stat_for_period(data, period): + # TODO sort by lists + periods = {} + for key, value in data.items(): + if period == 'y': + periods = fill_dict(periods, key.year, value) + periods = dict(sorted(periods.items())) + elif period == 'm': + periods = fill_dict(periods, MONTHS[key.month - 1], value) + periods = dict(sorted(periods.items(), key=lambda x: MONTHS.index(x[0]))) + elif period == 'd': + periods = fill_dict(periods, WEEKDAYS[key.weekday()], value) + periods = dict(sorted(periods.items(), key=lambda x: WEEKDAYS.index(x[0]))) + elif period == 'h': + periods = fill_dict(periods, key.hour, value) + periods = dict(sorted(periods.items())) + return periods diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..76f8eef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +seaborn==0.10.1 +dateparser==0.7.6 +pandas==1.0.3 +matplotlib==3.2.1 +python_dateutil==2.8.1 diff --git a/tests/test_ConversationAnalyzer.py b/tests/test_ConversationAnalyzer.py index f7d679f..9d11e46 100644 --- a/tests/test_ConversationAnalyzer.py +++ b/tests/test_ConversationAnalyzer.py @@ -1,36 +1,35 @@ import pytest -from miner.ConversationAnalyzer import ConversationAnalyzer -from miner.People import People + +from miner.Analyzer import Analyzer from miner.utils import dt TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' -@pytest.fixture(scope='session') -def person(get_people): - def _person(name): - people = get_people(name) - return people.data[name] - - return _person +# @pytest.fixture(scope='session') +# def person(get_people): +# def _person(name): +# people = get_people(name) +# return people.data[name] +# +# return _person @pytest.fixture(scope='session') -def analyze(person): +def analyze(get_people): def _analyze(name): - individual = person(name) - return ConversationAnalyzer(name, individual.messages) + people = get_people(name) + return Analyzer(people) return _analyze @pytest.fixture(scope='session') -def statistics(person, analyze): +def statistics(analyze): def _stats(name, **kwargs): - individual = person(name) analyzer = analyze(name) if 'subject' in kwargs or 'start' in kwargs or 'end' in kwargs: # and others - return analyzer.get_stats(individual.messages, **kwargs) + return analyzer.get_stats(**kwargs) else: return analyzer.stats @@ -219,10 +218,6 @@ def test_stats_teflon_musk_all_2014_12(statistics): # assert stats.most_used_chars == 0 -class TestConversationAnalyzer: # Foo Bar - pass - - def test_time_series_analysis_for_user(analyze): analyzer = analyze('Teflon Musk') analyzer.get_time_series_data(subject='all', period='y') diff --git a/tests/test_Conversations.py b/tests/test_Conversations.py index d066361..ef9fdc3 100644 --- a/tests/test_Conversations.py +++ b/tests/test_Conversations.py @@ -1,72 +1,76 @@ import pandas as pd import pytest from miner.Conversations import Conversations +from miner.Individual import Individual from miner import utils import os + TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' @pytest.fixture() -def convos(): - convo = Conversations(f'{TEST_DATA_PATH}') - return convo.get_people_from_private_messages() +def conversations(): + return Conversations(f'{TEST_DATA_PATH}') + + +@pytest.fixture +def people_from_private_convos(conversations): + return conversations.get_people_from_private_messages() + + +def test_if_paths_are_registered(conversations): + assert len(conversations.private_convo_paths) == 4 + assert len(conversations.group_convo_paths) == 3 + assert len(conversations.deleted_user_convo_paths) == 0 -def test_get_all_people_from_convo(convos): +def test_get_all_people_from_private_messages(people_from_private_convos): + people = list(people_from_private_convos.keys()) + expected = ['Foo Bar', 'Teflon Musk', 'Benedek Elek', 'Tőke Hal'] + assert sorted(people) == sorted(expected) + + +def test_get_all_people_from_convo(conversations): people = [] - # TODO make this work - for convo in convos.keys(): - if convo.startswith('group'): - people += [p for p in convos[convo].get('participants')] - else: - people.append(convo) - people = list(set(people)) + # indie + people += list(conversations.private_convo_paths.keys()) + # group + people_from_groups = [p for people in conversations.group_convo_paths.values() for p in people] + + people += people_from_groups expected = ['Dér Dénes', 'Facebook User', 'Foo Bar', 'John Doe', 'Teflon Musk', 'Benedek Elek', 'Donald Duck', 'Tőke Hal'] - # TODO LATER what to do with Facebook User?????? - assert sorted(people) == sorted(expected) + assert sorted(list(set(people))) == sorted(expected) + + +def test_people_are_individual_instances(people_from_private_convos): + assert all([isinstance(person, Individual) for person in people_from_private_convos.values()]) + + +def test_all_individual_have_messages_df(people_from_private_convos): + assert all([isinstance(data.messages, pd.DataFrame) for data in people_from_private_convos.values()]) -def test_all_convos_have_dir(convos): - assert all([data.messages_dir for data in convos.values()]) +def test_all_individual_have_dir(people_from_private_convos): + assert all([data.messages_dir for data in people_from_private_convos.values()]) -def test_all_convos_have_messages_df(convos): - assert all([isinstance(data.messages, pd.DataFrame) for data in convos.values()]) +def test_some_individual_as_media_dir(people_from_private_convos): + assert people_from_private_convos.get('Teflon Musk').media_dir + assert not people_from_private_convos.get('Benedek Elek').media_dir -def test_some_convos_as_media_dir(convos): - assert convos.get('Teflon Musk').media_dir - assert not convos.get('Benedek Elek').media_dir -def test_convo_media_has_one_folder_of_possibles(convos): - listed_dir = os.listdir(f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{convos.get('Teflon Musk').media_dir}") +def test_individual_media_has_one_folder_of_possibles(people_from_private_convos): + listed_dir = os.listdir( + f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{people_from_private_convos.get('Teflon Musk').media_dir}") assert 'files' in listed_dir assert 'photos' in listed_dir assert 'audio' not in listed_dir -def test_groups_have_more_than_two_participates(convos): - groups = {convo: data for convo, data in convos.items() if convo.startswith('group')} + +def test_groups_have_more_than_two_participates(people_from_private_convos): + groups = {convo: data for convo, data in people_from_private_convos.items() if convo.startswith('group')} # TODO participants should contain the user itself as well assert all([len(data.get('participants')) > 2 for data in groups.values()]) - - - -""" -testcases: -- individual convos contain all names, compact_names, message folders and media folders - - media folders are a big question. how do you get it? actually once you have the thread_path then from that you can guess, - OR better off use the uri in the messages... fuck seems complicated -- friends contain all names and compact names, -- convos and friends has a common set, and the set is identical -- people gets assigned with all the unique friends and individual/group convos - -gonna test: -- assigning messages to friends, -- deal with multiple directories, IF there are multiple directories, -- -concerns: -- what to do with non-friends, -- I assume multiple directories are because of files sent, -""" diff --git a/tests/test_MessagingAnalyzer.py b/tests/test_MessagingAnalyzer.py index ec03497..969dc5d 100644 --- a/tests/test_MessagingAnalyzer.py +++ b/tests/test_MessagingAnalyzer.py @@ -1,244 +1,245 @@ import pytest -from miner.MessagingAnalyzer import MessagingAnalyzer + +from miner.Analyzer import Analyzer from miner.utils import dt @pytest.fixture(scope='session') def analyzer(get_people): people = get_people() - return MessagingAnalyzer(people) + return Analyzer(people) def test_total_number_of_messages(analyzer): - assert analyzer.total_number_of_messages() == 29 + assert analyzer.get_count(attribute='msg_count', ) == 29 - assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014), period='y') == 11 - assert analyzer.total_number_of_messages(start=dt(year=2018), period='y') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2020), period='y') == 15 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014), period='y') == 11 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018), period='y') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020), period='y') == 15 - assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=11), period='m') == 8 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=12), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=11), period='m') == 8 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=12), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=5), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2), period='m') == 10 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=3), period='m') == 1 # jpg - assert analyzer.total_number_of_messages(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=8), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2), period='m') == 10 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=3), period='m') == 1 # jpg + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=8), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13), period='d') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13), period='d') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 def test_total_number_of_words(analyzer): - assert analyzer.total_number_of_words() == 86 + assert analyzer.get_count(attribute='word_count', ) == 86 - assert analyzer.total_number_of_words(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words(start=dt(year=2014), period='y') == 20 - assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 32 - assert analyzer.total_number_of_words(start=dt(year=2020), period='y') == 34 + assert analyzer.get_count(attribute='word_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014), period='y') == 20 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018), period='y') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020), period='y') == 34 - assert analyzer.total_number_of_words(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words(start=dt(year=2014, month=11), period='m') == 13 - assert analyzer.total_number_of_words(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=11), period='m') == 13 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 32 - assert analyzer.total_number_of_words(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=1), period='m') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2), period='m') == 27 - assert analyzer.total_number_of_words(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=8), period='m') == 2 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2), period='m') == 27 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=8), period='m') == 2 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13), period='d') == 14 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13), period='d') == 14 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 def test_total_number_of_characters(analyzer): - assert analyzer.total_number_of_characters() == 379 + assert analyzer.get_count(attribute='char_count', ) == 379 - assert analyzer.total_number_of_characters(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2014), period='y') == 69 - assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2020), period='y') == 140 + assert analyzer.get_count(attribute='char_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014), period='y') == 69 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018), period='y') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020), period='y') == 140 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=9), period='m') == 24 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=11), period='m') == 42 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=12), period='m') == 3 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=11), period='m') == 42 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=12), period='m') == 3 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=1), period='m') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=2), period='m') == 114 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=4), period='m') == 17 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=5), period='m') == 4 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=8), period='m') == 5 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=2), period='m') == 114 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=5), period='m') == 4 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=8), period='m') == 5 def test_total_number_of_messages_sent(analyzer): - assert analyzer.total_number_of_messages_sent() == 17 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014), period='y') == 6 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018), period='y') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020), period='y') == 9 + assert analyzer.get_count(attribute='msg_count', subject='me', ) == 17 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014), period='y') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018), period='y') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020), period='y') == 9 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=1), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=1), period='m') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2), period='m') == 6 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=8), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2), period='m') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=8), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13), period='d') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 def test_total_number_of_words_sent(analyzer): - assert analyzer.total_number_of_words_sent() == 69 + assert analyzer.get_count(attribute='word_count', subject='me', ) == 69 - assert analyzer.total_number_of_words_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2014), period='y') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2020), period='y') == 22 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014), period='y') == 16 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018), period='y') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020), period='y') == 22 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=11), period='m') == 9 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=1), period='m') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2), period='m') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=8), period='m') == 2 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2), period='m') == 16 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=8), period='m') == 2 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13), period='d') == 5 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 5 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 def test_total_number_of_characters_sent(analyzer): - assert analyzer.total_number_of_characters_sent() == 311 + assert analyzer.get_count(attribute='char_count', subject='me', ) == 311 - assert analyzer.total_number_of_characters_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014), period='y') == 60 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 167 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020), period='y') == 84 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014), period='y') == 60 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018), period='y') == 167 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020), period='y') == 84 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=9), period='m') == 24 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=11), period='m') == 33 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=12), period='m') == 3 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=11), period='m') == 33 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=12), period='m') == 3 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 167 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=1), period='m') == 167 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2), period='m') == 62 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=4), period='m') == 17 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=8), period='m') == 5 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2), period='m') == 62 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=8), period='m') == 5 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 def test_total_number_of_messages_received(analyzer): - assert analyzer.total_number_of_messages_received() == 12 - assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2014), period='y') == 5 - assert analyzer.total_number_of_messages_received(start=dt(year=2018), period='y') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020), period='y') == 6 + assert analyzer.get_count(attribute='msg_count', subject='partner', ) == 12 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014), period='y') == 5 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018), period='y') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020), period='y') == 6 - assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=9), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2), period='m') == 4 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=3), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=4), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=8), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=3), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=4), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=8), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=13), period='d') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=14), period='d') == 2 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=18), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 1 def test_total_number_of_words_received(analyzer): - assert analyzer.total_number_of_words_received() == 17 + assert analyzer.get_count(attribute='word_count', subject='partner', ) == 17 - assert analyzer.total_number_of_words_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2014), period='y') == 4 - assert analyzer.total_number_of_words_received(start=dt(year=2018), period='y') == 1 - assert analyzer.total_number_of_words_received(start=dt(year=2020), period='y') == 12 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014), period='y') == 4 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018), period='y') == 1 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020), period='y') == 12 - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=9), period='m') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=12), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2018, month=1), period='m') == 1 - assert analyzer.total_number_of_words_received(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2), period='m') == 11 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2), period='m') == 11 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=13), period='d') == 9 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=14), period='d') == 2 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=18), period='d') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 9 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0 def test_total_number_of_characters_received(analyzer): - assert analyzer.total_number_of_characters_received() == 68 + assert analyzer.get_count(attribute='char_count', subject='partner', ) == 68 - assert analyzer.total_number_of_characters_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2014), period='y') == 9 - assert analyzer.total_number_of_characters_received(start=dt(year=2018), period='y') == 3 - assert analyzer.total_number_of_characters_received(start=dt(year=2020), period='y') == 56 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014), period='y') == 9 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018), period='y') == 3 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020), period='y') == 56 - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=9), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=11), period='m') == 9 - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=12), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2), period='m') == 52 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=5), period='m') == 4 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2), period='m') == 52 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=5), period='m') == 4 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=13), period='d') == 30 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=14), period='d') == 22 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=18), period='d') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 30 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 22 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0 diff --git a/tests/test_utils.py b/tests/test_utils.py index 4d11263..3e57e17 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -65,19 +65,20 @@ def test_generate_date_series(): + # TODO resolve start = datetime(2020, 1, 1, 0, 0) end = datetime(2021, 1, 1, 0, 0) - date_range_year = generate_date_series(start, end, 'y') + date_range_year = generate_date_series('y', start, end) assert len(date_range_year) == 1 + 1 - date_range_month = generate_date_series(start, end, 'm') + date_range_month = generate_date_series('m', start, end) assert len(date_range_month) == 12 + 1 - date_range_day = generate_date_series(start, end, 'd') + date_range_day = generate_date_series('d', start, end) assert len(date_range_day) == 366 + 1 - date_range_hour = generate_date_series(start, end, 'h') + date_range_hour = generate_date_series('h', start, end) assert len(date_range_hour) == (366 * 24) + 1 for day in date_range_day: From 55ba0d0f3f21a96bb7a18315ae7d7b8f596e1863 Mon Sep 17 00:00:00 2001 From: Levente Csoke Date: Sun, 16 Aug 2020 14:35:48 +0200 Subject: [PATCH 3/3] refactored analyzer; added some documentation; resolved TODOs --- miner/Analyzer.py | 108 ++++++++++++++--------------- miner/App.py | 11 ++- miner/Conversations.py | 16 +++-- miner/FacebookData.py | 6 +- miner/Friends.py | 3 + miner/Group.py | 8 +-- miner/Individual.py | 4 ++ miner/Me.py | 4 ++ miner/Messages.py | 10 +-- miner/People.py | 11 ++- miner/requirements.txt | 4 -- miner/utils.py | 67 ++++++++++++++---- tests/conftest.py | 4 +- tests/test_ConversationAnalyzer.py | 7 +- tests/test_Conversations.py | 3 +- tests/test_Friends.py | 3 +- tests/test_utils.py | 10 +-- 17 files changed, 166 insertions(+), 113 deletions(-) diff --git a/miner/Analyzer.py b/miner/Analyzer.py index b0e61bc..17d9f68 100644 --- a/miner/Analyzer.py +++ b/miner/Analyzer.py @@ -1,34 +1,45 @@ +import pandas as pd + from miner.ConversationStats import ConversationStats from miner import utils -import pandas as pd class Analyzer: - # TODO do we need to override __subclasscheck__ ? + """ + Analyzer for analyzing specific and/or all conversations - # def __new__(cls, name, messages, *args, **kwargs): - # if messages is None: # This deals with the case if no messages - # return None - # return super(Analyzer, cls).__new__(cls, *args, **kwargs) + """ def __init__(self, people): self.people = people self.people_data = people.data - self.names = people.names + self.names = list(people.names) self.multi = len(self.people_data) > 1 if self.multi: - self.df = self.stack_dfs() + self.df = self.stack_dfs(self.people_data) + else: + self.df = self.people_data.get(self.names[0]).messages + + def __str__(self): + if self.multi: + return self.names else: - # TODO solve this hand in hand with the __new__ method. too ugly - self.df = self.people_data.get(list(self.names)[0]).messages + return f'{self.names[0]}: {list(self.df.index)}' - def get_stats_for_intervals(self, time_series, subject='all'): + @property + def stats(self): + return self.get_stats() + + def get_stats_for_intervals(self, time_series, period, subject='all'): data = {} - for i in range(len(time_series) - 1): # only looping len - 1 times + for i in range(len(time_series)): start = time_series[i] - end = time_series[i + 1] - data[start] = self.get_stats(self.df, subject=subject, start=start, end=end) + try: # with this solution we will have data for the very last moments until datetime.now() + end = time_series[i + 1] + except IndexError: + end = None + data[start] = self.get_stats(df=self.df, subject=subject, start=start, end=end, period=period) return data def get_stats(self, df=None, subject='all', start=None, end=None, period=None): @@ -37,30 +48,6 @@ def get_stats(self, df=None, subject='all', start=None, end=None, period=None): stats = ConversationStats(df) return stats - @staticmethod - def get_plottable_time_series_data(interval_stats, statistic): - for k, v in interval_stats.items(): - if isinstance(v, ConversationStats): - interval_stats[k] = getattr(v, statistic) - return interval_stats - - @property - def stats(self): - return self.get_stats() - - def __str__(self): - if self.multi: - return self.names - else: - return f'{self.names[0]}: {list(self.df.index)}' - - def stack_dfs(self): - dfs = [] - for data in self.people_data.values(): - if data.messages is not None: - dfs.append(data.messages) - return pd.concat(dfs).sort_index() - # 1. Total count of messages/words/characters (also by year/month/day/hour) # 2. Total count of messages/words/characters sent (also by year/month/day/hour) # 3. Total count of messages/words/characters received (also by year/month) @@ -68,8 +55,6 @@ def get_count(self, attribute, subject='all', start=None, end=None, period=None) stats = self.get_stats(subject=subject, start=start, end=end, period=period) return getattr(stats, attribute) - ################# - # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) def most_used_messages_(self, **kwargs): """ @@ -88,38 +73,47 @@ def most_used_messages_(self, **kwargs): pass # 5. Number of messages sent/got on busiest period (by year/month/day/hour) - def stat_per_period(self, period, attribute, **kwargs): + def stat_per_period(self, period, statistic, **kwargs): interval_stats = self.get_time_series_data(period, **kwargs) - # TODO attribute is one of (msg, word, char) - time_series_data = self.get_plottable_time_series_data(interval_stats, statistic=attribute) + time_series_data = self.get_stat_count(interval_stats, statistic=statistic) return utils.count_stat_for_period(time_series_data, period) - # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' + # 6. Time series: dict of 'y/m/d/h : number of messages/words/characters (also sent/got) for user/all convos' def get_time_series_data(self, period, subject='all', **kwargs): - time_series = utils.generate_date_series(period, **kwargs) - return self.get_stats_for_intervals(self.df, time_series, subject=subject) + time_series = utils.generate_date_series(period=period, **kwargs) + return self.get_stats_for_intervals(time_series, period, subject=subject) - # # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got - def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None, - period=None): - # TODO almost the same function as get_count + # # 7. Ranking of partners by messages by y/m/d/h, by different stats, by sent/got + def get_ranking_of_partners_by_messages(self, statistic='msg_count', **kwargs): count_dict = {} for name in self.names: - # analyzer = Analyzer({name: self.people.get(name)}) # this has to be a people instance?! OR? - # analyzer = Analyzer(People(self.people.data_path, name=name)) # this has to be a people instance?! OR? df = self.df[self.df.partner == name] - stats = self.get_stats(df=df, subject=subject, start=start, end=end, period=period) + stats = self.get_stats(df=df, **kwargs) if stats is not None: - count_dict = utils.fill_dict(count_dict, name, getattr(stats, attribute)) - - count_dict = {key: value for key, value in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)} + count_dict = utils.fill_dict(count_dict, name, getattr(stats, statistic)) return count_dict + @staticmethod + def stack_dfs(people_data): + dfs = [] + for data in people_data.values(): + if data.messages is not None: + dfs.append(data.messages) + return pd.concat(dfs).sort_index() + + @staticmethod + @utils.attribute_checker + def get_stat_count(interval_stats, statistic='msg_count'): + for k, v in interval_stats.items(): + interval_stats[k] = getattr(v, statistic) + return interval_stats + @staticmethod @utils.subject_checker @utils.date_checker - @utils.period_checker + @utils.start_end_period_checker def filter_by_input(df, subject='all', start=None, end=None, period=None): + if subject == 'me': df = df[df.sender_name == 'Levente Csőke'] elif subject == 'partner': diff --git a/miner/App.py b/miner/App.py index 4989e8b..7813f9e 100644 --- a/miner/App.py +++ b/miner/App.py @@ -1,11 +1,16 @@ -from miner.Analyzer import Analyzer +import os +from miner.Analyzer import Analyzer from miner.People import People -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' +DATA_PATH = f'{os.getcwd()}/data' class App: + """ + Entrypoint. Not yet used extensively. + # TODO LATER turn it into a cli + """ def __init__(self): pass @@ -14,7 +19,7 @@ def analyze_messages(): p = People(path=DATA_PATH) analyzer = Analyzer(p) - rank = analyzer.get_ranking_of_friends_by_messages(attribute='char_count') + rank = analyzer.get_ranking_of_partners_by_messages(attribute='char_count') if __name__ == '__main__': diff --git a/miner/Conversations.py b/miner/Conversations.py index 7a373d4..b4a4381 100644 --- a/miner/Conversations.py +++ b/miner/Conversations.py @@ -1,7 +1,6 @@ import pandas as pd import os - from miner.Messages import Messages from miner.Individual import Individual @@ -9,9 +8,13 @@ class Conversations: + """ + Class for managing and parsing conversations + """ + def __init__(self, data_path): self.private_convo_paths = {} - self.group_convo_paths = {} # TODO fill this as well + self.group_convo_paths = {} # TODO LATER fill this as well self.deleted_user_convo_paths = [] # NOTE these are collected but not yet used self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}' @@ -41,7 +44,7 @@ def differentiate_paths(self, jsons): def register_paths(self): utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json') - + def read_paths(self, file): self.private_convo_paths = utils.read_json(file) print() @@ -55,6 +58,8 @@ def map_private_convo_files(self, msg, file): def map_group_convo_files(self, msg, file): for participant in msg.participants: + if participant == 'Levente Csőke': + continue if self.group_convo_paths.get(file): self.group_convo_paths[file].append(participant) else: @@ -100,7 +105,4 @@ def group_membership(name): return None def get_people_from_group_messages(self): - pass # TODO for v0.0.4 - - - + pass diff --git a/miner/FacebookData.py b/miner/FacebookData.py index 81b946f..ef5ba78 100644 --- a/miner/FacebookData.py +++ b/miner/FacebookData.py @@ -3,6 +3,10 @@ class FacebookData: + """ + Base class for reading in tabular data from JSONs. + """ + def __init__(self, json_path): self.json_path = json_path self._df = None @@ -21,7 +25,7 @@ def json(self): @property def compact_names(self): - name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) # should be just fine + name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) return name_list[0] if len(name_list) == 1 else name_list def to_df(self, field=None): diff --git a/miner/Friends.py b/miner/Friends.py index 98d995f..5acc1be 100644 --- a/miner/Friends.py +++ b/miner/Friends.py @@ -3,6 +3,9 @@ class Friends(FacebookData): + """ + Class for storing data in friends.json + """ def __init__(self, *args): super().__init__(*args) diff --git a/miner/Group.py b/miner/Group.py index 94e1ed9..3a1b131 100644 --- a/miner/Group.py +++ b/miner/Group.py @@ -4,6 +4,10 @@ class Group: + """ + Class for holding a group-message's data + """ + def __init__(self, name=None, title=None, messages=None, compact=None, messages_dir=None, media_dir=None, members=None): self._name = name @@ -26,10 +30,6 @@ def title(self): def messages(self): return self._messages - # @property - # def get_message_jsons(self): - # return self._messages - @property def media_dir(self): return self._media_dir diff --git a/miner/Individual.py b/miner/Individual.py index 4518a5f..6f818ab 100644 --- a/miner/Individual.py +++ b/miner/Individual.py @@ -1,4 +1,8 @@ class Individual: + """ + Class for holding a person's data the user ever interacted with + """ + def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, member_of=None): diff --git a/miner/Me.py b/miner/Me.py index b10356d..ef2179e 100644 --- a/miner/Me.py +++ b/miner/Me.py @@ -2,6 +2,10 @@ class Me(FacebookData): + """ + Class for storing basic data about the user + """ + def __init__(self, *args): super().__init__(*args) diff --git a/miner/Messages.py b/miner/Messages.py index 6fbc9d3..ebdaadf 100644 --- a/miner/Messages.py +++ b/miner/Messages.py @@ -7,6 +7,10 @@ class Messages(FacebookData): + """ + Class for representing data of all the messages with a user or a group + """ + def __init__(self, json_path): super().__init__(json_path) self.to_df('messages') @@ -15,7 +19,6 @@ def __init__(self, json_path): @property def names(self): - # TODO ugly try: return pd.DataFrame(self.participants)[0] except KeyError: @@ -24,10 +27,7 @@ def names(self): @property def participants(self): participants = self.decoded.get('participants') - # TODO I should be IN - # but this breaks stuff at TestMessagingAnalyzer - return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] - # return [p.get('name') for p in participants if p.get('name')] + return [p.get('name') for p in participants if p.get('name')] @property def title(self): diff --git a/miner/People.py b/miner/People.py index 2970390..b7852e8 100644 --- a/miner/People.py +++ b/miner/People.py @@ -1,12 +1,18 @@ import time +import os from miner.Conversations import Conversations from miner.Friends import Friends -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' +DATA_PATH = f'{os.getcwd()}/data' class People: + """ + Class that manages and represents people from different kind of interactions + # TODO LATER abstractional flaw?! people? person? indie? + """ + def __init__(self, path=None, name=None): self.data_path = path if path else DATA_PATH self._groups = [] @@ -19,7 +25,7 @@ def data(self): @property def names(self): - return self._names #if len(self._names) > 1 else self._names[0] + return self._names # if len(self._names) > 1 else self._names[0] @property def groups(self): @@ -31,7 +37,6 @@ def get_people(self, name=None): friends = friend.get_people(name=name) print('friends: ', time.time() - start) - # TODO LATER too slow -> store in file start = time.time() conversations = Conversations(self.data_path) print('convos1: ', time.time() - start) diff --git a/miner/requirements.txt b/miner/requirements.txt index 1262ec9..8ee3351 100644 --- a/miner/requirements.txt +++ b/miner/requirements.txt @@ -1,9 +1,5 @@ numpy==1.18.1 pandas==1.0.3 dateparser==0.7.6 -seaborn==0.10.1 matplotlib==3.2.1 -plotly==4.8.2 -miner==0.0.0 -Pillow==7.2.0 python_dateutil==2.8.1 diff --git a/miner/utils.py b/miner/utils.py index cfa7644..6f6f565 100644 --- a/miner/utils.py +++ b/miner/utils.py @@ -1,8 +1,9 @@ -import os -import json -import dateparser -from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta +from datetime import datetime, timedelta +import dateparser +from itertools import islice +import json +import os MESSAGE_SUBPATH = 'messages/inbox' MEDIA_DIRS = ['photos', 'gifs', 'files', 'videos', 'audio'] @@ -111,11 +112,10 @@ def wrapper(*args, **kwargs): return wrapper -def period_checker(func): +def start_end_period_checker(func): def wrapper(*args, **kwargs): if kwargs.get('start') is not None and kwargs.get('end') is not None: return func(*args, **kwargs) - if not kwargs.get('period') or DELTA_MAP[kwargs.get('period')] is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') kwargs['period'] = DELTA_MAP[kwargs.get('period')] @@ -124,17 +124,33 @@ def wrapper(*args, **kwargs): return wrapper -def generate_date_series(period, start=None, end=None): - if period is None or DELTA_MAP.get(period) is None: - raise ValueError('Parameter `period` should be one of {y, m, d, h}') - start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO LATER change this to date when user joined FB - end = end or datetime.now() +def period_checker(func): + def wrapper(*args, **kwargs): + if not kwargs.get('period') or DELTA_MAP[kwargs.get('period')] is None: + raise ValueError('Parameter `period` should be one of {y, m, d, h}') + return func(*args, **kwargs) + + return wrapper - # TODO THIS HAS A PROBLEM. msgs happened in 2020 getting assigned to 2019 because: 2019 + 1 year + start.month + start.day < now() - # TODO serious problem! + +def get_start_based_on_period(join_date, period): + if period == 'y': + return datetime(join_date.year, 1, 1) + elif period == 'm': + return datetime(join_date.year, join_date.month, 1) + return join_date + + +@period_checker +def generate_date_series(period='y', start=None, end=None): dates = [] + + join_date = datetime(year=2009, month=10, day=2) # TODO later get this from somewhere + start = start or get_start_based_on_period(join_date, period) + end = end or datetime.now() + intermediate = start - while intermediate <= (end + DELTA_MAP.get(period)): # means that we want to have the end in it as well + while intermediate <= end: # means that we want to have the end in it as well dates.append(intermediate) intermediate = intermediate + DELTA_MAP.get(period) return dates @@ -224,3 +240,26 @@ def count_stat_for_period(data, period): periods = fill_dict(periods, key.hour, value) periods = dict(sorted(periods.items())) return periods + + +def sort_dict(dictionary, func=lambda x: x, reverse=False): + return {key: value for key, value in sorted(dictionary.items(), key=func, reverse=reverse)} + + +def remove_items_where_value_is_falsible(dictionary): + return {k: v for k, v in dictionary.items() if v} + + +# keep only first 20 entries +def slice_dict(dictionary, n): + return dict(islice(dictionary.items(), n)) + + +def attribute_checker(func): + def wrapper(*args, **kwargs): + statistic = kwargs.get('statistic') + if not statistic or statistic not in ('msg_count', 'word_count', 'char_count'): + raise ValueError('Parameter `statistic` should be one of {msg_count, word_count, char_count}') + return func(*args, **kwargs) + + return wrapper diff --git a/tests/conftest.py b/tests/conftest.py index 08e382b..ebdc35e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,9 @@ import pytest +import os + from miner.People import People -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +TEST_DATA_PATH = f'{os.getcwd()}/test_data' @pytest.fixture(scope='session') diff --git a/tests/test_ConversationAnalyzer.py b/tests/test_ConversationAnalyzer.py index 9d11e46..61ab388 100644 --- a/tests/test_ConversationAnalyzer.py +++ b/tests/test_ConversationAnalyzer.py @@ -3,8 +3,6 @@ from miner.Analyzer import Analyzer from miner.utils import dt -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' - # @pytest.fixture(scope='session') # def person(get_people): @@ -218,7 +216,4 @@ def test_stats_teflon_musk_all_2014_12(statistics): # assert stats.most_used_chars == 0 -def test_time_series_analysis_for_user(analyze): - analyzer = analyze('Teflon Musk') - analyzer.get_time_series_data(subject='all', period='y') - assert 1 + diff --git a/tests/test_Conversations.py b/tests/test_Conversations.py index ef9fdc3..5a7cb58 100644 --- a/tests/test_Conversations.py +++ b/tests/test_Conversations.py @@ -5,7 +5,7 @@ from miner import utils import os -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +TEST_DATA_PATH = f'{os.getcwd()}/test_data' @pytest.fixture() @@ -72,5 +72,4 @@ def test_individual_media_has_one_folder_of_possibles(people_from_private_convos def test_groups_have_more_than_two_participates(people_from_private_convos): groups = {convo: data for convo, data in people_from_private_convos.items() if convo.startswith('group')} - # TODO participants should contain the user itself as well assert all([len(data.get('participants')) > 2 for data in groups.values()]) diff --git a/tests/test_Friends.py b/tests/test_Friends.py index 652b671..c6abfea 100644 --- a/tests/test_Friends.py +++ b/tests/test_Friends.py @@ -1,8 +1,9 @@ import pytest +import os from miner.Friends import Friends -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +TEST_DATA_PATH = f'{os.getcwd()}/test_data' @pytest.fixture() diff --git a/tests/test_utils.py b/tests/test_utils.py index 3e57e17..3577add 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -69,20 +69,20 @@ def test_generate_date_series(): start = datetime(2020, 1, 1, 0, 0) end = datetime(2021, 1, 1, 0, 0) - date_range_year = generate_date_series('y', start, end) + date_range_year = generate_date_series(period='y', start=start, end=end) assert len(date_range_year) == 1 + 1 - date_range_month = generate_date_series('m', start, end) + date_range_month = generate_date_series(period='m', start=start, end=end) assert len(date_range_month) == 12 + 1 - date_range_day = generate_date_series('d', start, end) + date_range_day = generate_date_series(period='d', start=start, end=end) assert len(date_range_day) == 366 + 1 - date_range_hour = generate_date_series('h', start, end) + date_range_hour = generate_date_series(period='h', start=start, end=end) assert len(date_range_hour) == (366 * 24) + 1 for day in date_range_day: assert isinstance(day, datetime) with pytest.raises(ValueError): - faulty_date_range = generate_date_series(start, end, ) + faulty_date_range = generate_date_series(start=start, end=end, )