diff --git a/.gitignore b/.gitignore index f6ccf6b..5b8858a 100644 --- a/.gitignore +++ b/.gitignore @@ -236,4 +236,16 @@ data todo.md +# ignoring trash file +trash.py + + +# ignoring jupyter notebook +explore.ipynb + + +# ignoring jupyter notebook +tests/playground.py + +tests/.pytest_cache .pytest_cache \ No newline at end of file diff --git a/ConversationAnalyzer.py b/ConversationAnalyzer.py new file mode 100644 index 0000000..fe30520 --- /dev/null +++ b/ConversationAnalyzer.py @@ -0,0 +1,152 @@ +import pandas as pd +from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals + + +class ConversationAnalyzer: + def __new__(cls, name, messages, *args, **kwargs): + if messages is None: # This deals with the case if no messages + return None + return super(ConversationAnalyzer, cls).__new__(cls, *args, **kwargs) + + def __init__(self, name, messages): + self.name = name + self.df = messages + + def __str__(self): + return f'{self.name}: {list(self.df.index)}' + + @property + def stats(self): + return self.get_stats(self.df) + + # TODO has to be tested + def get_time_series_data(self, subject='all', **kwargs): + time_series = generate_time_series(**kwargs) + return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) + + def get_plotable_time_series_data(self, interval_stats, statistic): + for k, v in interval_stats.items(): + if isinstance(v, ConversationStats): + interval_stats[k] = getattr(v, statistic) + return interval_stats + + def get_stats(self, df, subject='all', start=None, end=None, period=None): + df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) + stats = ConversationStats(df) + return stats + + @staticmethod + @subject_checker + @date_checker + @period_checker + def filter_by_input(df, subject='all', start=None, end=None, period=None): + if subject == 'me': + df = df[df.sender_name == 'Levente Csőke'] + elif subject == 'partner': + df = 
df[df.sender_name != 'Levente Csőke'] + if start and end: + df = df.loc[start:end] + elif start and not end: + df = df.loc[start:start + period] + elif not start and end: + df = df.loc[end - period:end] + return df + + +class ConversationStats: + """ + Statistics of conversation with one person. + """ + + # TODO do we need this or not?!?! smh + # def __new__(cls, df, *args, **kwargs): + # if not len(df.index): # This deals with the case if input df is empty + # return None + # return super(ConversationStats, cls).__new__(cls, *args, **kwargs) + + def __init__(self, df): + self.df = df + + def __repr__(self): + return f'{self.msg_count}' + + @property + def messages(self): + return self.df.content.dropna() + + @property + def words(self): + return self.get_words() + + # 1. + @property + def msg_count(self): + return len(self.df) + + # 2. + @property + def unique_msg_count(self): + return len(self.messages.unique()) + + # 3. + @property + def most_used_msgs(self): + # TODO first few (1-10) messages + return self.messages.value_counts() + + # 4. + @property + def msg_frequency(self): + # TODO this has been most likely depracated + pass + + # 5. + @property + def word_count(self): + return len(self.words) + + # 6. + @property + def unique_word_count(self): + return len(set(self.words)) + + # 7. + @property + def most_used_words(self): + s = pd.Series(self.words) + return s.value_counts() + + # 8. + @property + def word_frequency(self): + pass + + # 9. + @property + def char_count(self): + char_count = 0 + for word in self.words: + char_count += len(word) + return char_count + + # 10. + @property + def most_used_chars(self): + return None # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string + + # 11. + @property + def rate_of_media_messages(self): + pass # TODO what? 
+ + def get_words(self): + token_list = self.messages.str.lower().str.split() + words = [] + for tokens in token_list: + # print(tokens) + if not isinstance(tokens, list): + print('WARNING! Not a list!') + continue # TODO ??? check this + for token in tokens: + words.append(token) + return words diff --git a/Conversations.py b/Conversations.py new file mode 100644 index 0000000..112f336 --- /dev/null +++ b/Conversations.py @@ -0,0 +1,116 @@ +import os +from FacebookData import FacebookData +import pandas as pd + +from datetime import datetime + +MESSAGE_SUBPATH = 'messages/inbox' + + +class Conversations: + def __init__(self, data_path): + self.data_path = f'{data_path}/{MESSAGE_SUBPATH}' + + def get_people(self): + json_paths = self.walk_directory_and_search('.json') + return self.extract_names_from_convos(json_paths) + + def walk_directory_and_search(self, extension): + paths = [] + for root, dirs, files in os.walk(self.data_path): + for name in files: + if name.endswith(extension): + paths.append(os.path.join(root, name)) + return paths + + # TODO simplify this function!! also this takes very long + @staticmethod + def extract_names_from_convos(jsons): + name_data_map = {} + count = 0 + for file in jsons: + msg = Messages(file) + for participant in msg.participants: + key = participant if msg.ttype == 'Regular' else f'group_{count}' + if key == 'Facebook User': # TODO ?? what to do with this?? + continue + if name_data_map.get(key) and key.startswith( + 'group'): # making sure run only once even if it is a group + continue + if name_data_map.get(key): + dfs = [name_data_map[key]['messages'], msg.df] + name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index() + else: + name_data_map[key] = { + 'title': msg.title, + 'compact_name': msg.compact_names, # TODO is list ok for if length is only 1?? 
+ # 'participants': msg.participants + ['Levente Csőke'], + 'participants': msg.participants, + 'messages': msg.df, + 'friend': None, + 'messages_dir': msg.messages_dir, + 'media_dir': msg.media_dir + } + if msg.ttype == 'RegularGroup': + count += 1 + + return name_data_map + + +class Messages(FacebookData): + def __init__(self, json_path): + super().__init__(json_path) + self.to_df() + self.set_date_as_index() + + def to_df(self): + self._df = pd.DataFrame(self.decoded.get('messages')) + + def set_date_as_index(self): + # TODO maybe not needed; could calculate real time + date_series = self._df.timestamp_ms.apply(self.ts_to_date) + self._df = self._df.set_index(date_series).iloc[::-1] + + @property + def names(self): + return pd.DataFrame(self.participants)[0] + + @property + def participants(self): + participants = self.decoded.get('participants') + # TODO I should be IN + # but this breaks stuff at TestMessagingAnalyzer + return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] + # return [p.get('name') for p in participants if p.get('name')] + + @property + def title(self): + return self.decoded.get('title') + + @property + def ttype(self): + return self.decoded.get('thread_type') + + @property + def messages_dir(self): + thread_path = self.decoded.get('thread_path') + if not thread_path.startswith('inbox/'): + raise ValueError('Something is not okay.') + # TODO here or in the upper function where we extract names + return thread_path.split('/')[1].lower() + + @property + def media_dir(self): + # todo what should the path contain + for media in ['photos', 'gifs', 'files', 'videos', 'audio']: + if media in self._df.columns: + media_in_msg = list(self._df[media][self._df[media].notnull()]) + # if len(media_in_msg) > 1: # TODO is this ok. i think it is. 
think multiple photos sent once + # print('Media in msg is bigger than 1') + uri = media_in_msg[0][0].get('uri') + return os.path.dirname(os.path.dirname(uri)) + return None + + @staticmethod + def ts_to_date(date): + return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/FacebookData.py b/FacebookData.py new file mode 100644 index 0000000..a82c896 --- /dev/null +++ b/FacebookData.py @@ -0,0 +1,40 @@ +from utils import read_json, decode_text, accents_map + + +class FacebookData: + def __init__(self, json_path): + self.json_path = json_path + self._df = None + + @property + def df(self): + return self._df + + @property + def decoded(self): + return decode_text(self.json) + + @property + def json(self): + return read_json(self.json_path) + + @property + def compact_names(self): + # NOTE this is the place where we change pd/np to builtin + # do we have to do this? + name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names))) + return name_list[0] if len(name_list) == 1 else name_list + + @staticmethod + def lower_names(col): + return col.str.lower() + + @staticmethod + def without_accent_and_whitespace(col): + def replace_accents(text): + for char in accents_map.keys(): + if char in text: + text = text.replace(char, accents_map[char]) + return text.replace(' ', '') + + return col.apply(replace_accents) diff --git a/Friends.py b/Friends.py new file mode 100644 index 0000000..6e0e991 --- /dev/null +++ b/Friends.py @@ -0,0 +1,36 @@ +import pandas as pd +import os +from FacebookData import FacebookData +from utils import accents_map + + +class Friends(FacebookData): + + def __init__(self, *args): + super().__init__(*args) + + # self.path = 'data/friends' + # self.json_path = f'{self.path}/friends.json' + + self.to_df() + + def get_people(self): + names = {} + for name, compact in zip(self.names, self.compact_names): + names[name] = { + 'title': name, + 'compact_name': compact, + 'messages': None, + 'friend': True, + 
'participants': None, + 'messages_dir': None, + 'media_dir': None + } + return names + + def to_df(self): + self._df = pd.DataFrame(self.decoded.get('friends')) + + @property + def names(self): + return self.df.name diff --git a/Group.py b/Group.py new file mode 100644 index 0000000..41b966f --- /dev/null +++ b/Group.py @@ -0,0 +1,47 @@ +# TODO groups should be searched by looking into jsons unfortunately :( +# because of directory says others +# maybe we dont use groups right away? + + +class Group: + def __init__(self, name=None, title=None, messages=None, compact=None, messages_dir=None, media_dir=None, + members=None): + self._name = name + self._title = title + self._messages = messages + self._compact_name = compact + self._messages_dir = messages_dir + self._media_dir = media_dir + self._members = members + + @property + def name(self): + return self._name + + @property + def title(self): + return self._title + + @property + def messages(self): + return self._messages + + # @property + # def get_message_jsons(self): + # return self._messages + + @property + def media_dir(self): + return self._media_dir + + @property + def messages_dir(self): + return self._messages_dir + + @property + def compact_name(self): + return self._compact_name + + @property + def members(self): + return self._members diff --git a/Individual.py b/Individual.py new file mode 100644 index 0000000..a9f8d03 --- /dev/null +++ b/Individual.py @@ -0,0 +1,47 @@ +class Individual: + def __init__(self, name=None, title=None,compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, + member_of=None): + self._name = name + self._title = title + self._compact_name = compact + self._messages = messages + self._friend = friend + self._messages_dir = messages_dir + self._media_dir = media_dir + self._member_of = member_of + + + def __repr__(self): + return self.name + + @property + def name(self): + return self._name + + @property + def title(self): + return self._title + + 
@property + def messages(self): + return self._messages + + @property + def friend(self): + return self._friend + + @property + def media_dir(self): + return self._media_dir + + @property + def messages_dir(self): + return self._messages_dir + + @property + def compact_name(self): + return self._compact_name + + @property + def member_of(self): + return self._member_of diff --git a/Me.py b/Me.py new file mode 100644 index 0000000..3293bf7 --- /dev/null +++ b/Me.py @@ -0,0 +1,10 @@ +from FacebookData import FacebookData + + +class Me(FacebookData): + def __init__(self, *args): + super().__init__(*args) + + @property + def name(self): + return 'Levente Csőke' diff --git a/MessagingAnalyzer.py b/MessagingAnalyzer.py new file mode 100644 index 0000000..07965e5 --- /dev/null +++ b/MessagingAnalyzer.py @@ -0,0 +1,135 @@ +from utils import year_converter, month_converter, generate_time_series, get_stats_for_intervals +from datetime import datetime, date, timedelta +import pandas as pd +from ConversationAnalyzer import ConversationAnalyzer + +""" + +""" + + +class MessagingAnalyzer: + def __init__(self, names, people): + # TODO input people only. 
class ill know what to do + self.names = names + self.people = people + + def time_series_analysis_for_all(self, subject=None, **kwargs): + time_series = generate_time_series(**kwargs) + stacked_df = self.stack_dfs(self.people) + interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) + + def get_stats(self, df, subject='all', start=None, end=None, period=None): + # TODO + # here you have to do something with it + pass + + def get_count(self, attribute, subject='all', start=None, end=None, period=None): + count = 0 + # we have a list of names we want to iterate over + for name in self.names: + stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period) + if stats is not None: # TODO too explicit; needed because it is possible that None will be returned, if t got an empty df + count += getattr(stats, attribute) + return count + + def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None): + messages = self.people.get(name).messages + analyzer = ConversationAnalyzer(name, messages) + if analyzer is None: # TODO this is too explicit ?! + return None + return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period) + + def total_number_of_(self, attribute, subject='all', **kwargs): + return self.get_count(attribute=attribute, subject=subject, **kwargs) + + # 1. Ranking of friends by total count of messages/words/characters (also by year/month/day/hour) + def total_number_of_messages(self, **kwargs): + return self.total_number_of_(attribute='msg_count', **kwargs) + + def total_number_of_words(self, **kwargs): + return self.total_number_of_(attribute='word_count', **kwargs) + + def total_number_of_characters(self, **kwargs): + return self.total_number_of_(attribute='char_count', **kwargs) + + # 2. 
Ranking of friends who I sent the most messages/words/characters (also by year/month/day/hour) + def total_number_of_messages_sent(self, **kwargs): + return self.total_number_of_(attribute='msg_count', subject='me', **kwargs) + + def total_number_of_words_sent(self, **kwargs): + return self.total_number_of_(attribute='word_count', subject='me', **kwargs) + + def total_number_of_characters_sent(self, **kwargs): + return self.total_number_of_(attribute='char_count', subject='me', **kwargs) + + # 3. Ranking of friends who sent the most messages/words/characters (also by year/month) + def total_number_of_messages_received(self, **kwargs): + return self.total_number_of_(attribute='msg_count', subject='partner', **kwargs) + + def total_number_of_words_received(self, **kwargs): + return self.total_number_of_(attribute='word_count', subject='partner', **kwargs) + + def total_number_of_characters_received(self, **kwargs): + return self.total_number_of_(attribute='char_count', subject='partner', **kwargs) + + # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) + def most_used_messages_by_me(self, **kwargs): + """ + >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) + >>> s2 = pd.Series([3, 2, 1, 1]) + >>> s1_vc = s1.value_counts() + >>> s2_vc = s2.value_counts() + TODO (later) most used is already a problem: + - because its a series of all the unique messages/words ever used in a convo + - it contains strings like ':d', ':p' and 'xd' + - from all the convos the result of value_counts has to be cleared + and has to be truncated (that is not use the 200th most used word, only top10 let's say) + - then these series has to be merged in a way that the same string's counts are added up + - what about typos????! + """ + pass + + def most_used_messages_by_partners(self, **kwargs): + pass + + def most_used_words_by_me(self, **kwargs): + pass + + def most_used_words_by_partners(self, **kwargs): + pass + + # 5. 
Number of messages sent/got on busiest period (by year/month/day/hour) + def days_when_most_messages_sent(self): + # TODO hard algorithmic problem + pass + + def days_when_most_messages_received(self): + pass + + def hours_when_most_messages_sent(self): + # TODO + # is this referring to the absolute hour most messages sent?? + # like: 2014.07.25. 15h-16h + # OR + # the pattern of most messages sent between this and this hours + # like: 20h-21h + # ACTUALLY BOTH + # for years/months/days/hours + # BUT this comes from the time series analysis + pass + + def hours_when_most_messages_received(self): + pass + + # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' + # TODO + + @staticmethod + def stack_dfs(people): + dfs = [] + for data in people.values(): + if data.messages is not None: + dfs.append(data.messages) + # TODO do I need to sort by index (date)? yes! + return pd.concat(dfs).sort_index() # TODO why ignore_index?? diff --git a/Miner.py b/Miner.py index e69de29..3b32806 100644 --- a/Miner.py +++ b/Miner.py @@ -0,0 +1,45 @@ +DATA_PATH = '/home/levente/projects/facebook-data-miner/data' + +from People import People +from ConversationAnalyzer import ConversationAnalyzer +from MessagingAnalyzer import MessagingAnalyzer + + +class Miner: + def __init__(self): + pass + + @staticmethod + def analyze_messages(): + p = People(path=DATA_PATH) + + stats = {} + + for name, person in p.individuals.items(): + #assert name == person.name, 'ERRRRRRROR!!!' 
+ if person.messages is None: + stats[person.name] = None + continue + analyzer = ConversationAnalyzer(person.name, person.messages) + stats[person.name] = analyzer.stats + # if stats[person.name].get('message_count').get('me') > 5000: + # top[person.name] = stats[person.name] + example = stats['Dániel Nagy'] + print() + + # print('LEN: ', len(top.keys())) + # top_all = {name: data.get('message_count').get('all') for name, data in top.items()} + # analyzer.visualize_stats(top) + + @staticmethod + def analyze_messaging(): + p = People(path=DATA_PATH) + + msg_analyzer = MessagingAnalyzer(p.names, p.individuals) + + msgs = msg_analyzer.total_number_of_messages() + + +if __name__ == '__main__': + m = Miner() + m.analyze_messages() diff --git a/People.py b/People.py new file mode 100644 index 0000000..11d1887 --- /dev/null +++ b/People.py @@ -0,0 +1,79 @@ +from Individual import Individual +from Conversations import Conversations +from Friends import Friends + +# from Me import Me +DATA_PATH = '/home/levente/projects/facebook-data-miner/data' +import time +from Group import Group + + +# TODO we dont need both data and individuals... or?? + +class People: + def __init__(self, path=None): + self.data_path = path if path else DATA_PATH + self._names = [] + self._individuals = {} + self._groups = [] + self._data = self.get_people() # TODO is this supposed to be here or elsewhere + self.to_individuals() # TODO is this supposed to be here or elsewhere + + @property + def data(self): + return self._data + + @property + def names(self): + return self._names + + @property + def individuals(self): + return self._individuals + + @property + def groups(self): + return self._groups + + def get_people(self): + start = time.time() + friends = Friends(self.data_path + '/friends/friends.json') + people1 = friends.get_people() + print('friends: ', time.time() - start) + + # TODO LATER too slow + # takes about 30 secs both + # read it once, store it in DB OR? 
+ start = time.time() + conversations = Conversations(self.data_path) + people2 = conversations.get_people() + print('convos: ', time.time() - start) + + return self.unify_people(people1, people2) + + def to_individuals(self): # TODO maybe rather split_convos or differentiate_convos + start = time.time() + for person, data in self._data.items(): + if person.startswith('group'): + g = Group(name=data.get('name'), title=data.get('title'), messages=data.get('messages'), + compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), + media_dir=data.get('media_dir'), members=None) + self._groups.append(g) + else: + indie = Individual(name=person, title=data.get('title'), messages=data.get('messages'), + compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), + media_dir=data.get('media_dir'), member_of=None) + self._names.append(person) + self._individuals[person] = indie + print('indies: ', time.time() - start) + + @staticmethod + def unify_people(friends, convos): + for person, data in friends.items(): + if not convos.get(person): + convos[person] = data + convos[person]['friend'] = True + return convos + +# if __name__ == '__main__': +# p = People() diff --git a/README.md b/README.md index d9dbdd3..943a123 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -# facebook-data-miner \ No newline at end of file +# Facebook Data Miner +This repository has a purpose to provide a set of tools, with which one can analyze their facebook data locally. + +The codebase is under development. + +Features will be added gradually, starting with basic analysis of the messages. +Don't expect too fast development tho'. + +More info soon... + +## Contribution +Help is more than welcome. If somebody feel the urge to contribute, I would share my plans with them. + +Ideas are welcome too. Feel free to open a new issue. 
\ No newline at end of file diff --git a/Visualizer.py b/Visualizer.py new file mode 100644 index 0000000..b4a3779 --- /dev/null +++ b/Visualizer.py @@ -0,0 +1,36 @@ +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from People import People +from ConversationAnalyzer import ConversationAnalyzer + + +# plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120}) + +class Visualizer: + def __init__(self): + pass + + def plot_time_series(self, x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100): + plt.figure(figsize=(16, 5), dpi=dpi) + plt.plot(x, y, color='tab:red') + plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel) + plt.show() + + +def set_up(people, name, interval='y'): + analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages) + interval_stats = analyzer.get_time_series_data() + stats = interval_stats.get(interval) + return analyzer.get_plotable_time_series_data(stats, statistic='msg_count') + + +if __name__ == "__main__": + v = Visualizer() + TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + people = People(path=TEST_DATA_PATH) + names = ['Teflon Musk', 'Tőke Hal'] + for name in names: + data = set_up(people, name, interval='d') + df = pd.DataFrame(data.items(), columns=['date', 'value']) + v.plot_time_series(x=df.date, y=df.value, title=name) diff --git a/tests/TestConversationAnalyzer.py b/tests/TestConversationAnalyzer.py new file mode 100644 index 0000000..5fa87d8 --- /dev/null +++ b/tests/TestConversationAnalyzer.py @@ -0,0 +1,231 @@ +import pytest +from ConversationAnalyzer import ConversationAnalyzer +from People import People +from utils import dt + +TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + + +# @pytest.mark.parametrize("test_input,expected", [("3+5", 8), ("2+4", 6), ("6*9", 42)]) +# def test_eval(test_input, expected): +# assert eval(test_input) == expected + +# get\(\'.*\'\)\. 
+ + +@pytest.fixture(scope='session') +def person(people): + def _person(name): + return people.individuals[name] + + return _person + + +@pytest.fixture(scope='session') +def analyze(person): + def _analyze(name): + individual = person(name) + return ConversationAnalyzer(name, individual.messages) + + return _analyze + + +@pytest.fixture(scope='session') +def statistics(person, analyze): + def _stats(name, **kwargs): + individual = person(name) + analyzer = analyze(name) + if 'subject' in kwargs or 'start' in kwargs or 'end' in kwargs: # and others + return analyzer.get_stats(individual.messages, **kwargs) + else: + return analyzer.stats + + return _stats + + +# TODO extend all functions with all the data +def test_stats_toke_hal_all(statistics): + stats = statistics('Tőke Hal') + + assert stats.msg_count == 5 + assert stats.unique_msg_count == 4 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 6 + assert stats.unique_word_count == 4 + # assert stats.word_frequency == 0 + assert stats.char_count == 17 + # assert stats.most_used_chars == 0 + + +def test_stats_toke_hal_me(statistics): + stats = statistics('Tőke Hal', subject='me') + + assert stats.msg_count == 3 + assert stats.unique_msg_count == 3 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 4 + assert stats.unique_word_count == 3 + # assert stats.word_frequency == 0 + assert stats.char_count == 12 + # assert stats.most_used_chars == 0 + + +def test_stats_toke_hal_partner(statistics): + stats = statistics('Tőke Hal', subject='partner') + + assert stats.msg_count == 2 + assert stats.unique_msg_count == 2 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 2 + assert stats.unique_word_count == 2 + # assert stats.word_frequency == 0 + assert stats.char_count == 5 + # assert stats.most_used_chars == 0 + + +def test_stats_toke_hal_all_2014_11(statistics): + stats 
= statistics('Tőke Hal', subject='all', start=dt(2014, 11), period='m') + + assert stats.msg_count == 4 + assert stats.unique_msg_count == 3 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + # assert stats.word_frequency == 0 + # assert stats.most_used_chars == 0 + + +def test_stats_toke_hal_partner_2014_11(statistics): + stats = statistics('Tőke Hal', subject='partner', start=dt(2014, 11), period='m') + assert stats.char_count == 5 + assert stats.word_count == 2 + + +def test_stats_toke_hal_me_2014_11(statistics): + stats = statistics('Tőke Hal', subject='me', start=dt(2014, 11), period='m') + assert stats.unique_word_count == 3 + + +# +def test_stats_toke_hal_all_2014_12(statistics): + stats = statistics('Tőke Hal', subject='all', start=dt(2014, 12), period='m') + assert stats.msg_count == 1 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.unique_word_count == 1 + # assert stats.word_frequency == 0 + assert stats.char_count == 3 + # assert stats.most_used_chars == 0 + + +def test_stats_toke_hal_partner_2014_12(statistics): + stats = statistics('Tőke Hal', subject='partner', start=dt(2014, 12), period='m') + assert stats.word_count == 0 + + +def test_stats_toke_hal_me_2014_12(statistics): + stats = statistics('Tőke Hal', subject='me', start=dt(2014, 12), period='m') + assert stats.unique_msg_count == 1 + + +def test_stats_teflon_musk(statistics): + stats = statistics('Teflon Musk') + assert stats.msg_count == 6 + assert stats.unique_msg_count == 2 # TODO this does not count media messages + # assert stats.most_used_msgs == 0 # TODO should only return the most used or e.g. 
top10 most used + # assert stats.msg_frequency == 0 + assert stats.word_count == 14 + assert stats.unique_word_count == 7 + # assert stats.word_frequency == 0 + assert stats.char_count == 52 # 23 + # assert stats.most_used_chars == 0 + + +def test_stats_teflon_musk_me(statistics): + stats = statistics('Teflon Musk', subject='me') + assert stats.msg_count == 3 + assert stats.unique_msg_count == 1 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 12 + assert stats.unique_word_count == 6 + # assert stats.word_frequency == 0 + assert stats.char_count == 48 + # assert stats.most_used_chars == 0 + + +def test_stats_teflon_musk_partner(statistics): + stats = statistics('Teflon Musk', subject='partner') + assert stats.msg_count == 3 + assert stats.unique_msg_count == 1 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 2 + assert stats.unique_word_count == 1 + # assert stats.word_frequency == 0 + assert stats.char_count == 4 + # assert stats.most_used_chars == 0 + + +def test_stats_teflon_musk_all_2014_9(statistics): + stats = statistics('Teflon Musk', subject='all', start=dt(2014, 9), period='m') + assert stats.msg_count == 1 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 6 + # assert stats.word_frequency == 0 + # assert stats.most_used_chars == 0 + + +def test_stats_teflon_musk_me_2014_9(statistics): + stats = statistics('Teflon Musk', subject='me', start=dt(2014, 9), period='m') + assert stats.unique_word_count == 6 + + +def test_stats_teflon_musk_partner_2014_9(statistics): + stats = statistics('Teflon Musk', subject='partner', start=dt(2014, 9), period='m') + assert stats.unique_msg_count == 0 + assert stats.char_count == 0 + + +def test_stats_teflon_musk_all_2014_11(statistics): + stats = statistics('Teflon Musk', subject='all', start=dt(2014, 11), period='m') + assert stats.msg_count == 4 + # assert 
stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + # assert stats.word_frequency == 0 + # assert stats.most_used_chars == 0 + + +def test_stats_teflon_musk_me_2014_11(statistics): + stats = statistics('Teflon Musk', subject='me', start=dt(2014, 11), period='m') + assert stats.word_count == 6 + + +def test_stats_teflon_musk_partner_2014_11(statistics): + stats = statistics('Teflon Musk', subject='partner', start=dt(2014, 11), period='m') + assert stats.unique_msg_count == 1 + assert stats.unique_word_count == 1 + assert stats.char_count == 4 + + +def test_stats_teflon_musk_all_2014_12(statistics): + stats = statistics('Teflon Musk', subject='all', start=dt(2014, 12), period='m') + + assert stats.msg_count == 1 + assert stats.unique_msg_count == 0 + # assert stats.most_used_msgs == 0 + # assert stats.msg_frequency == 0 + assert stats.word_count == 0 + assert stats.unique_word_count == 0 + # assert stats.word_frequency == 0 + assert stats.char_count == 0 + # assert stats.most_used_chars == 0 + + +def test_time_series_analysis_for_user(analyze): + analyzer = analyze('Teflon Musk') + analyzer.get_time_series_data(subject='all') + assert 1 diff --git a/tests/TestConversations.py b/tests/TestConversations.py new file mode 100644 index 0000000..e198dc9 --- /dev/null +++ b/tests/TestConversations.py @@ -0,0 +1,71 @@ +import pandas as pd +import pytest +from Conversations import Conversations +import os +TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + + +@pytest.fixture() +def convos(): + convo = Conversations(f'{TEST_DATA_PATH}') + return convo.get_people() + + +def test_get_all_people_from_convo(convos): + people = [] + + for convo in convos.keys(): + if convo.startswith('group'): + people += [p for p in convos[convo].get('participants')] + else: + people.append(convo) + people = list(set(people)) + + expected = ['Dér Dénes', 'Facebook User', 'Foo Bar', 'John Doe', 'Teflon Musk', 'Benedek Elek', 'Donald Duck', + 'Tőke Hal'] + # 
TODO LATER what to do with Facebook User?????? + assert sorted(people) == sorted(expected) + + +def test_all_convos_have_dir(convos): + assert all([data.get('messages_dir') for data in convos.values()]) + + +def test_all_convos_have_messages_df(convos): + assert all([isinstance(data.get('messages'), pd.DataFrame) for data in convos.values()]) + + +def test_some_convos_as_media_dir(convos): + assert convos.get('Teflon Musk').get('media_dir') + assert not convos.get('Benedek Elek').get('media_dir') + +def test_convo_media_has_one_folder_of_possibles(convos): + listed_dir = os.listdir(f"{TEST_DATA_PATH}/{convos.get('Teflon Musk').get('media_dir')}") + assert 'files' in listed_dir + assert 'photos' in listed_dir + assert 'audio' not in listed_dir + +def test_groups_have_more_than_two_participates(convos): + groups = {convo: data for convo, data in convos.items() if convo.startswith('group')} + # TODO participants should contain the user itself as well + assert all([len(data.get('participants')) > 2 for data in groups.values()]) + + + +""" +testcases: +- individual convos contain all names, compact_names, message folders and media folders + - media folders are a big question. how do you get it? actually once you have the thread_path then from that you can guess, + OR better off use the uri in the messages... 
fuck seems complicated +- friends contain all names and compact names, +- convos and friends has a common set, and the set is identical +- people gets assigned with all the unique friends and individual/group convos + +gonna test: +- assigning messages to friends, +- deal with multiple directories, IF there are multiple directories, +- +concerns: +- what to do with non-friends, +- I assume multiple directories are because of files sent, +""" diff --git a/tests/TestFriends.py b/tests/TestFriends.py new file mode 100644 index 0000000..f336609 --- /dev/null +++ b/tests/TestFriends.py @@ -0,0 +1,38 @@ +import pytest + +from Friends import Friends + +TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + + +@pytest.fixture() +def expected_friends(): + return {'John Doe': {'compact_name': 'johndoe', 'path': None}, + 'Donald Duck': {'compact_name': 'donaldduck', 'path': None}, + 'Szett Droxler': {'compact_name': 'szettdroxler', 'path': None}, + 'Foo Bar': {'compact_name': 'foobar', 'path': None}, + 'Tőke Hal': {'compact_name': 'tokehal', 'path': None}, + 'Dér Dénes': {'compact_name': 'derdenes', 'path': None}, + 'Teflon Musk': {'compact_name': 'teflonmusk', 'path': None}, + 'Daisy Duck': {'compact_name': 'daisyduck', 'path': None}, + 'Guy Fawkes': {'compact_name': 'guyfawkes', 'path': None}} + + +@pytest.fixture() +def friends(): + f = Friends(f'{TEST_DATA_PATH}/friends/friends.json') + return f.get_people() + + +def test_get_peoples_names_from_friends(friends, expected_friends): + assert all([p in expected_friends.keys() for p in friends]) + + +def test_get_peoples_compact_name_from_friends(friends, expected_friends): + expected_compact_names = [value.get('compact_name') for value in expected_friends.values()] + + assert all([p.get('compact_name') in expected_compact_names for p in friends.values()]) + + + +# TODO what happens when two friends have same name?? 
diff --git a/tests/TestMessages.py b/tests/TestMessages.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/TestMessagingAnalyzer.py b/tests/TestMessagingAnalyzer.py new file mode 100644 index 0000000..d53af70 --- /dev/null +++ b/tests/TestMessagingAnalyzer.py @@ -0,0 +1,143 @@ +import pytest +from MessagingAnalyzer import MessagingAnalyzer +from utils import dt + + +@pytest.fixture(scope='session') +def analyzer(people): + return MessagingAnalyzer(people.names, people.individuals) + + +def test_total_number_of_messages(analyzer): + assert analyzer.total_number_of_messages() == 14 + assert analyzer.total_number_of_messages(start=dt(year=2014), period='y') == 11 + assert analyzer.total_number_of_messages(start=dt(year=2018), period='y') == 3 + + assert analyzer.total_number_of_messages(start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.total_number_of_messages(start=dt(year=2014, month=11), period='m') == 8 + assert analyzer.total_number_of_messages(start=dt(year=2014, month=12), period='m') == 2 + assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 + + assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.total_number_of_messages(start=dt(year=2018, month=5), period='m') == 0 + + +def test_total_number_of_words(analyzer): + assert analyzer.total_number_of_words() == 24 + + assert analyzer.total_number_of_words(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_words(start=dt(year=2014), period='y') == 20 + assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 4 + + assert analyzer.total_number_of_words(start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.total_number_of_words(start=dt(year=2014, month=11), period='m') == 13 + assert analyzer.total_number_of_words(start=dt(year=2014, month=12), period='m') == 1 + + 
assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 4 + assert analyzer.total_number_of_words(start=dt(year=2018, month=2), period='m') == 0 + + +def test_total_number_of_characters(analyzer): + assert analyzer.total_number_of_characters() == 81 + + assert analyzer.total_number_of_characters(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_characters(start=dt(year=2014), period='y') == 69 + assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 12 + + assert analyzer.total_number_of_characters(start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.total_number_of_characters(start=dt(year=2014, month=11), period='m') == 42 + assert analyzer.total_number_of_characters(start=dt(year=2014, month=12), period='m') == 3 + + assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 12 + assert analyzer.total_number_of_characters(start=dt(year=2018, month=2), period='m') == 0 + + +def test_total_number_of_messages_sent(analyzer): + assert analyzer.total_number_of_messages_sent() == 8 + assert analyzer.total_number_of_messages_sent(start=dt(year=2014), period='y') == 6 + assert analyzer.total_number_of_messages_sent(start=dt(year=2018), period='y') == 2 + + assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=1), period='m') == 2 + + assert analyzer.total_number_of_messages_sent(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=5), period='m') == 0 + + +def test_total_number_of_words_sent(analyzer): + 
assert analyzer.total_number_of_words_sent() == 19 + + assert analyzer.total_number_of_words_sent(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_words_sent(start=dt(year=2014), period='y') == 16 + assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 3 + + assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=12), period='m') == 1 + + assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=2), period='m') == 0 + + +def test_total_number_of_characters_sent(analyzer): + assert analyzer.total_number_of_characters_sent() == 69 + + assert analyzer.total_number_of_characters_sent(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_characters_sent(start=dt(year=2014), period='y') == 60 + assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 9 + + assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=11), period='m') == 33 + assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=12), period='m') == 3 + + assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 9 + assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=2), period='m') == 0 + + +def test_total_number_of_messages_received(analyzer): + assert analyzer.total_number_of_messages_received() == 6 + assert analyzer.total_number_of_messages_received(start=dt(year=2014), period='y') == 5 + assert analyzer.total_number_of_messages_received(start=dt(year=2018), period='y') == 1 + + assert 
analyzer.total_number_of_messages_received(start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1 + + assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=5), period='m') == 0 + + +def test_total_number_of_words_received(analyzer): + assert analyzer.total_number_of_words_received() == 5 + + assert analyzer.total_number_of_words_received(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_words_received(start=dt(year=2014), period='y') == 4 + assert analyzer.total_number_of_words_received(start=dt(year=2018), period='y') == 1 + + assert analyzer.total_number_of_words_received(start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.total_number_of_words_received(start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.total_number_of_words_received(start=dt(year=2014, month=12), period='m') == 0 + + assert analyzer.total_number_of_words_received(start=dt(year=2018, month=1), period='m') == 1 + assert analyzer.total_number_of_words_received(start=dt(year=2018, month=2), period='m') == 0 + + +def test_total_number_of_characters_received(analyzer): + assert analyzer.total_number_of_characters_received() == 12 + + assert analyzer.total_number_of_characters_received(start=dt(year=2000), period='y') == 0 + assert analyzer.total_number_of_characters_received(start=dt(year=2014), period='y') == 9 + assert analyzer.total_number_of_characters_received(start=dt(year=2018), period='y') == 3 + + assert 
analyzer.total_number_of_characters_received(start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=12), period='m') == 0 + + assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=2), period='m') == 0 diff --git a/tests/TestPeople.py b/tests/TestPeople.py new file mode 100644 index 0000000..e29448b --- /dev/null +++ b/tests/TestPeople.py @@ -0,0 +1,36 @@ +import pytest + + + +@pytest.fixture() +def people_names(): + return ['John Doe', 'Donald Duck', 'Szett Droxler', 'Foo Bar', 'Tőke Hal', 'Dér Dénes', 'Teflon Musk', 'Daisy Duck', + 'Guy Fawkes', 'Benedek Elek'] + + +def test_specific_people_has_or_has_not_got_messages(people): + # TODO parametrize + import pandas as pd + assert isinstance(people.data.get('Benedek Elek').get('messages'), pd.DataFrame) + assert isinstance(people.data.get('Teflon Musk').get('messages'), pd.DataFrame) + assert isinstance(people.data.get('Tőke Hal').get('messages'), pd.DataFrame) + assert not isinstance(people.data.get('John Doe').get('messages'), pd.DataFrame) + assert not isinstance(people.data.get('Szett Droxler').get('messages'), pd.DataFrame) + assert not isinstance(people.data.get('Daisy Duck').get('messages'), pd.DataFrame) + assert not isinstance(people.data.get('Guy Fawkes').get('messages'), pd.DataFrame) + + +def test_people_name(people, people_names): + people_without_groups = [p for p in people.data.keys() if not p.startswith('group')] + assert sorted(people_names) == sorted(people_without_groups) + + +def test_some_convos_are_with_friends(people): + assert people.data.get('Teflon Musk').get('friend') + assert not people.data.get('Benedek Elek').get('friend') + + +def test_specific_people_has_or_has_not_got_media(people): + assert 
people.data.get('Teflon Musk').get('media_dir') + +#TODO test individuals too \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..430e923 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest +from People import People + +TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + + +@pytest.fixture(scope='session') +def people(): + p = People(path=TEST_DATA_PATH) + return p diff --git a/tests/playground.py b/tests/playground.py new file mode 100644 index 0000000..681c3cb --- /dev/null +++ b/tests/playground.py @@ -0,0 +1,29 @@ +import pytest +from pytest_cases import parametrize, fixture, fixture_ref, lazy_value +# @pytest.mark.parametrize("test_input,expected", [("3+5", 8), ("2+4", 6), ("6*9", 42)]) +# def test_eval(test_input, expected): +# assert eval(test_input) == expected + +@pytest.fixture +def world_str(): + return 'world' + + +def whatfun(): + return 'what' + + +@fixture +@parametrize('who', [fixture_ref(world_str), + 'you']) +def greetings(who): + return 'hello ' + who + + +@parametrize('main_msg', ['nothing', + fixture_ref(world_str), + lazy_value(whatfun), + fixture_ref(greetings)]) +@pytest.mark.parametrize('ending', ['?', '!']) +def test_prints(main_msg, ending): + print(main_msg + ending) diff --git a/tests/test_data/friends/friends.json b/tests/test_data/friends/friends.json new file mode 100644 index 0000000..50c6b8f --- /dev/null +++ b/tests/test_data/friends/friends.json @@ -0,0 +1,40 @@ +{ + "friends": [ + { + "name": "John Doe", + "timestamp": 1590673319 + }, + { + "name": "Donald Duck", + "timestamp": 1586461325 + }, + { + "name": "Szett Droxler", + "timestamp": 1584299908 + }, + { + "name": "Foo Bar", + "timestamp": 1584219292 + }, + { + "name": "T\u00c5\u0091ke Hal", + "timestamp": 1582290479 + }, + { + "name": "D\u00c3\u00a9r D\u00c3\u00a9nes", + "timestamp": 1581523312 + }, + { + "name": "Teflon Musk", + "timestamp": 1580999341 + }, + { + 
"name": "Daisy Duck", + "timestamp": 1580999200 + }, + { + "name": "Guy Fawkes", + "timestamp": 1580999100 + } + ] +} \ No newline at end of file diff --git a/tests/test_data/messages/inbox/TeflonMusk_fSD454F/files/1810.04805.pdf b/tests/test_data/messages/inbox/TeflonMusk_fSD454F/files/1810.04805.pdf new file mode 100644 index 0000000..2394716 Binary files /dev/null and b/tests/test_data/messages/inbox/TeflonMusk_fSD454F/files/1810.04805.pdf differ diff --git a/tests/test_data/messages/inbox/TeflonMusk_fSD454F/photos/index.jpeg b/tests/test_data/messages/inbox/TeflonMusk_fSD454F/photos/index.jpeg new file mode 100644 index 0000000..57fae9c Binary files /dev/null and b/tests/test_data/messages/inbox/TeflonMusk_fSD454F/photos/index.jpeg differ diff --git a/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json b/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json new file mode 100644 index 0000000..61c06a1 --- /dev/null +++ b/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json @@ -0,0 +1,34 @@ +{ + "participants": [ + { + "name": "Benedek Elek" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + } + ], + "messages": [ + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1515619045145, + "content": "not much", + "type": "Generic" + }, + { + "sender_name": "Benedek Elek", + "timestamp_ms": 1515618506047, + "content": "zup", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1515571228715, + "content": "yo", + "type": "Generic" + } + ], + "title": "Benedek Elek", + "is_still_participant": true, + "thread_type": "Regular", + "thread_path": "inbox/benedekelek_s4f65sdg" +} \ No newline at end of file diff --git a/tests/test_data/messages/inbox/marathon_sfFSFiD76/gifs/21297336_10214236646047101_8870179296803553280_n_2116299178397271.gif b/tests/test_data/messages/inbox/marathon_sfFSFiD76/gifs/21297336_10214236646047101_8870179296803553280_n_2116299178397271.gif new file mode 100644 
index 0000000..666b745 Binary files /dev/null and b/tests/test_data/messages/inbox/marathon_sfFSFiD76/gifs/21297336_10214236646047101_8870179296803553280_n_2116299178397271.gif differ diff --git a/tests/test_data/messages/inbox/marathon_sfFSFiD76/photos/index.jpeg b/tests/test_data/messages/inbox/marathon_sfFSFiD76/photos/index.jpeg new file mode 100644 index 0000000..57fae9c Binary files /dev/null and b/tests/test_data/messages/inbox/marathon_sfFSFiD76/photos/index.jpeg differ diff --git a/tests/test_data/messages/inbox/marathon_sffsfid76/message_1.json b/tests/test_data/messages/inbox/marathon_sffsfid76/message_1.json new file mode 100644 index 0000000..89ba754 --- /dev/null +++ b/tests/test_data/messages/inbox/marathon_sffsfid76/message_1.json @@ -0,0 +1,85 @@ +{ + "participants": [ + { + "name": "Teflon Musk" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + }, + { + "name": "Foo Bar" + }, + { + "name": "Donald Duck" + } + ], + "messages": [ + { + "sender_name": "Donald Duck", + "timestamp_ms": 1524142359709, + "content": ":D", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1524137912776, + "content": "hmmm", + "type": "Generic" + }, + { + "sender_name": "Donald Duck", + "timestamp_ms": 1524137882444, + "content": "we could go but running is free", + "type": "Generic" + }, + { + "sender_name": "Donald Duck", + "timestamp_ms": 1524137859673, + "photos": [ + { + "uri": "messages/inbox/marathon_sfFSFiD76/photos/index.jpeg", + "creation_timestamp": 1524137857 + } + ], + "type": "Generic" + }, + { + "sender_name": "Donald Duck", + "timestamp_ms": 1524137749717, + "content": "i start today", + "type": "Generic" + }, + { + "sender_name": "Donald Duck", + "timestamp_ms": 1524137737066, + "gifs": [ + { + "uri": "messages/inbox/marathon_sfFSFiD76/gifs/21297336_10214236646047101_8870179296803553280_n_2116299178397271.gif" + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 
1524133955273, + "content": "You named the group marathon.", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1524133941074, + "content": "yapp yapp :D", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1524133902152, + "content": "marathon?", + "type": "Generic" + } + ], + "title": "marathon", + "is_still_participant": true, + "thread_type": "RegularGroup", + "thread_path": "inbox/marathon_sffsfid76" +} \ No newline at end of file diff --git a/tests/test_data/messages/inbox/teflonmusk_fsd454f/message_1.json b/tests/test_data/messages/inbox/teflonmusk_fsd454f/message_1.json new file mode 100644 index 0000000..1c35b5e --- /dev/null +++ b/tests/test_data/messages/inbox/teflonmusk_fsd454f/message_1.json @@ -0,0 +1,60 @@ +{ + "participants": [ + { + "name": "Teflon Musk" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + } + ], + "messages": [ + { + "sender_name": "Teflon Musk", + "timestamp_ms": 1419620506047, + "files": [ + { + "uri": "messages/inbox/TeflonMusk_fSD454F/files/1810.04805.pdf" + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1416619045145, + "photos": [ + { + "uri": "messages/inbox/TeflonMusk_fSD454F/photos/index.jpeg" + } + ], + "type": "Generic" + }, + { + "sender_name": "Teflon Musk", + "timestamp_ms": 1415618806047, + "content": "no", + "type": "Generic" + }, + { + "sender_name": "Teflon Musk", + "timestamp_ms": 1415618506047, + "content": "no", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1415571228715, + "content": "are you the real teflon musk?", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1411570928715, + "content": "are you the real teflon musk?", + "type": "Generic" + } + ], + "title": "Teflon Musk", + "is_still_participant": true, + "thread_type": "Regular", + "thread_path": "inbox/teflonmusk_fsd454f" +} \ No newline at end of file 
diff --git a/tests/test_data/messages/inbox/teflonmuskfoobarandjohndoe_sf5d4f56sd_gf/message_1.json b/tests/test_data/messages/inbox/teflonmuskfoobarandjohndoe_sf5d4f56sd_gf/message_1.json new file mode 100644 index 0000000..84da80b --- /dev/null +++ b/tests/test_data/messages/inbox/teflonmuskfoobarandjohndoe_sf5d4f56sd_gf/message_1.json @@ -0,0 +1,46 @@ +{ + "participants": [ + { + "name": "Teflon Musk" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + }, + { + "name": "John Doe" + }, + { + "name": "Foo Bar" + } + ], + "messages": [ + { + "sender_name": "John Doe", + "timestamp_ms": 1310907774237, + "content": "ok", + "type": "Generic" + }, + { + "sender_name": "Teflon Musk", + "timestamp_ms": 1310907632012, + "content": "basic group messages", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1310907613721, + "content": "what do you test", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1310907608580, + "content": "test", + "type": "Generic" + } + ], + "title": "Foo Bar, John Doe and Teflon Musk", + "is_still_participant": true, + "thread_type": "RegularGroup", + "thread_path": "inbox/teflonmuskfoobarandjohndoe_sf5d4f56sd_gf" +} \ No newline at end of file diff --git a/tests/test_data/messages/inbox/tokehal_sdf7fs9d876/message_1.json b/tests/test_data/messages/inbox/tokehal_sdf7fs9d876/message_1.json new file mode 100644 index 0000000..66fc886 --- /dev/null +++ b/tests/test_data/messages/inbox/tokehal_sdf7fs9d876/message_1.json @@ -0,0 +1,46 @@ +{ + "participants": [ + { + "name": "T\u00c5\u0091ke Hal" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + } + ], + "messages": [ + { + "sender_name": "Levente CsÅ\u0091ke", + "timestamp_ms": 1417619245145, + "content": "not", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1415619045145, + "content": "not much", + "type": "Generic" + }, + { + "sender_name": "T\u00c5\u0091ke Hal", + "timestamp_ms": 1415618506047, 
+ "content": "zup", + "type": "Generic" + }, + { + "sender_name": "T\u00c5\u0091ke Hal", + "timestamp_ms": 1415618406047, + "content": "yo", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1415571228715, + "content": "yo", + "type": "Generic" + } + ], + "title": "T\u00c5\u0091ke Hal", + "is_still_participant": true, + "thread_type": "Regular", + "thread_path": "inbox/tokehal_sdf7fs9d876" +} \ No newline at end of file diff --git a/tests/test_data/messages/inbox/tokehalfoobardonaldduckand2others_safsdf_fdf3/message_1.json b/tests/test_data/messages/inbox/tokehalfoobardonaldduckand2others_safsdf_fdf3/message_1.json new file mode 100644 index 0000000..d9dfd9a --- /dev/null +++ b/tests/test_data/messages/inbox/tokehalfoobardonaldduckand2others_safsdf_fdf3/message_1.json @@ -0,0 +1,58 @@ +{ + "participants": [ + { + "name": "T\u00c5\u0091ke Hal" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + }, + { + "name": "D\u00c3\u00a9r D\u00c3\u00a9nes" + }, + { + "name": "Facebook User" + }, + { + "name": "Donald Duck" + }, + { + "name": "Foo Bar" + } + ], + "messages": [ + { + "sender_name": "Facebook User", + "timestamp_ms": 1310907774237, + "content": "ok", + "type": "Generic" + }, + { + "sender_name": "D\u00c3\u00a9r D\u00c3\u00a9nes", + "timestamp_ms": 1310907632012, + "content": "blabla", + "type": "Generic" + }, + { + "sender_name": "T\u00c5\u0091ke Hal", + "timestamp_ms": 1310907632012, + "content": "basic group messages", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1310907613721, + "content": "what do you test", + "type": "Generic" + }, + { + "sender_name": "Donald Duck", + "timestamp_ms": 1310907608580, + "content": "test", + "type": "Generic" + } + ], + "title": "T\u00c5\u0091ke Hal, Foo Bar, Donald Duck and 2 others", + "is_still_participant": true, + "thread_type": "RegularGroup", + "thread_path": "inbox/tokehalfoobardonaldduckand2others_safsdf_fdf3" +} \ No newline at end of file diff --git 
a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..76352c0 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,58 @@ +import unittest +from utils import * +from pathlib import Path +import reusables +from reusables.cli import * + + +#TODO convert this to pytest + +class TestUtils(unittest.TestCase): + def setUp(self): + self.test_data_path = Path(f'{pwd()}/test_data') + + def test_read_json(self): + dummy1 = {'0': 'a', '2': 'c'} + dummy1_path = Path(self.test_data_path / 'dummy1.json') + dump_to_json(file=dummy1_path, data=dummy1) + + dummy2 = {'1': 'b', '3': 'd'} + dummy2_path = Path(self.test_data_path / 'dummy2.json') + dump_to_json(file=dummy2_path, data=dummy2) + + dummy_unified = get_messages(dummy1_path, dummy2_path) + expected = {'0': 'a', '1': 'b', '2': 'c', '3': 'd'} + + self.assertDictEqual(expected, dummy_unified) + + dummy1_path.unlink() + dummy2_path.unlink() + + def test_decode_text(self): + dummy = {'0': '\u00c5\u0091', '1': '\u00c3\u00a1', '2': ['\u00c5\u0091', {'0': '\u00c3\u00a1'}], + '3': {'0': '\u00c5\u0091'}} + dummy_path = Path(self.test_data_path / 'dummy.json') + dump_to_json(file=dummy_path, data=dummy) + read = get_messages(dummy_path, decode=True) + expected = {'0': 'ő', '1': 'á', '2': ['ő', {'0': 'á'}], + '3': {'0': 'ő'}} + self.assertEqual(expected, read) + + dummy_path.unlink() + + def later_tests(self): + data = {'participants': [{'name': 'Csőke Boglárka'}, {'name': 'Levente Csőke'}], + 'title': 'Csőke Boglárka', + 'is_still_participant': True, + 'thread_type': 'Regular', + 'thread_path': 'inbox/CsokeBoglarka_5A48Zi9P1w', + 'messages': [...]} + msg_element = {'sender_name': 'Levente Csőke', + 'timestamp_ms': 1440948801592, + 'content': 'ahaa', + 'type': 'Generic'} + + def test_read_real_data(self): + bogi_msg = Path('/home/levente/projects/facebook-data-miner/data/messages/inbox/csokeboglarka_5a48zi9p1w') + data = get_messages(bogi_msg/'message_1.json', bogi_msg/'message_2.json') + msg = data['messages'] \ 
# NOTE: the patch markers that preceded this module in the diff are kept here as comments:
#   \ No newline at end of file        (end of tests/test_utils.py)
#   diff --git a/utils.py b/utils.py   (new file mode 100644, index 0000000..8594258, @@ -0,0 +1,191 @@)
"""Shared helpers: JSON loading/decoding, date utilities and argument-checking decorators."""
import functools
import json
from datetime import datetime, timedelta

import pandas as pd
from dateutil.relativedelta import relativedelta

try:
    import dateparser
except ImportError:  # only needed when `start`/`end` are passed as strings (see date_checker)
    dateparser = None

MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october',
          'november', 'december']

# Values accepted for the `subject` keyword (see subject_checker).
_SUBJECTS = ('all', 'me', 'partner')

# Step added to `start` / subtracted from `end` for each period code (see period_checker).
_PERIOD_DELTAS = {
    'y': relativedelta(years=+1),
    'm': relativedelta(months=+1),
    'd': timedelta(days=1),
    'h': timedelta(hours=1),
}

# pandas frequency for each period code (see generate_time_series).
# The hourly step is given as a timedelta so no version-specific alias spelling is involved.
_PERIOD_FREQUENCIES = {
    'y': 'YS',
    'm': '1MS',
    'd': '1D',
    'h': timedelta(hours=1),
}


def get_stats_for_intervals(func, df, time_series, subject='all'):
    """
    Evaluate @func on every interval delimited by consecutive entries of each series.
    @param func: callable invoked as func(df, subject=..., start=..., end=...).
    @param df: object handed to @func unchanged.
    @param time_series: mapping of offset key -> ordered, sliceable sequence of interval boundaries.
    @param subject: forwarded to @func.
    @return: {offset: {interval_start: func result}}.
    """
    data = {}
    for offset, series in time_series.items():
        # N boundaries delimit N - 1 intervals: pair each boundary with its successor.
        # NOTE(review): nothing after the final boundary is passed to @func, so a trailing partial
        # interval (last boundary -> "now") is not reported - confirm this is intended.
        data[offset] = {
            start: func(df, subject=subject, start=start, end=end)
            for start, end in zip(series[:-1], series[1:])
        }
    return data


def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0):
    """
    Shorthand constructor for a naive datetime; every omitted field falls back to its default.
    @return: datetime(year, month, day, hour).
    """
    return datetime(year=year, month=month, day=day, hour=hour)


def get_messages(*files, decode=True):
    """
    Read one or more JSON files and merge them into a single dict.
    @param files: paths of the JSON files, in the order they should be merged.
    @param decode: when True every string value is passed through decode_text.
    @return: the merged dict; `messages` lists are concatenated, other keys are unioned and the value
             from the earliest file wins on conflict.
    """
    data = {}
    for file in files:
        raw = read_json(file)
        temp = decode_text(raw) if decode else raw
        if not data:
            data = temp
        elif data.get('messages') and temp.get('messages'):
            data['messages'] += temp.get('messages')
        # Checked for every file (not only when messages were merged) so that files with
        # different key sets are unioned; keys already collected take precedence.
        if sorted(temp.keys()) != sorted(data.keys()):
            data = {**temp, **data}
    return data


def read_json(file):
    """
    Load a JSON document from @file.
    @param file: path of the file to read.
    @return: the parsed JSON content.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and the data may contain
    # raw non-ASCII characters next to \\u escapes.
    with open(file, encoding='utf-8') as f:
        return json.load(f)


def dump_to_json(data=None, file=None):
    """
    Serialize @data as JSON into @file, overwriting it.
    @param data: JSON-serializable object.
    @param file: path of the file to write.
    """
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f)


def decode_text(obj):
    """
    Recursively repair text whose UTF-8 bytes were stored as individual latin-1 characters.
    @param obj: a string, or an arbitrarily nested list/dict containing strings.
    @return: a structure of the same shape with every string value repaired; dict keys and
             non-string leaves are returned untouched.
    """
    if isinstance(obj, str):
        try:
            return obj.encode('latin_1').decode('utf-8')
        except UnicodeError:
            # Not mojibake (e.g. already decoded text): keep it instead of crashing,
            # which also makes the function safe to apply twice.
            return obj

    if isinstance(obj, list):
        return [decode_text(o) for o in obj]

    if isinstance(obj, dict):
        return {key: decode_text(item) for key, item in obj.items()}

    return obj


def order_list_of_dicts(lst, key='timestamp_ms'):
    """
    Return a new list with the dicts of @lst sorted ascending by their @key entry.
    @param lst: iterable of dicts that all contain @key.
    @param key: name of the entry to sort by.
    @return: sorted list; @lst itself is not modified.
    """
    return sorted(lst, key=lambda k: k[key])


# Lower-case accented letters mapped to their unaccented counterparts.
# NOTE(review): the upper-case variants are disabled - presumably text is lower-cased before
# this map is applied; confirm against the callers.
accents_map = {
    "á": "a",
    "é": "e",
    "í": "i",
    "ó": "o",
    "ö": "o",
    "ő": "o",
    "ú": "u",
    "ü": "u",
    "ű": "u",
    # "Á": "A",
    # "É": "E",
    # "Í": "I",
    # "Ó": "O",
    # "Ö": "O",
    # "Ő": "O",
    # "Ú": "U",
    # "Ü": "U",
    # "Ű": "U",
}


def year_converter(func):
    """
    Higher-order function that converts @year param passed to @func into numeric version.
    Only the `year` keyword argument is inspected; a value that cannot be converted is reported
    on stdout and passed through unchanged.
    @param func: function to wrap.
    @return: the wrapping function.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        year = kwargs.get('year')
        if not year or isinstance(year, int):
            return func(*args, **kwargs)
        # Only strings have isdigit(); any other type is reported instead of raising AttributeError.
        if isinstance(year, str) and year.isdigit():
            kwargs['year'] = int(year)
        else:
            print(f'Year is not a digit. Given year: {kwargs.get("year")}')
        return func(*args, **kwargs)

    return wrapper


def month_converter(func):
    """
    Higher-order function that converts @month param passed to @func into numeric version.
    Only the `month` keyword argument is inspected. Month names (case-insensitive) and digit
    strings are converted to int; other values are passed through unchanged.
    @param func: function to wrap.
    @return: the wrapping function.
    @raise ValueError: if `month` is a non-digit string that is not an English month name.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        month = kwargs.get('month')
        if not month:
            return func(*args, **kwargs)
        if isinstance(month, str):
            if month.isdigit():
                # Digit strings are converted too, mirroring year_converter.
                kwargs['month'] = int(month)
            else:
                try:
                    kwargs['month'] = MONTHS.index(month.lower()) + 1
                except ValueError:
                    raise ValueError(f'Unknown month name: {month!r}') from None
        return func(*args, **kwargs)

    return wrapper


# TODO period can refer to 2 things
# 1. a unit code: y/m/d/h
# 2. "return only one date_range series"
# decide which meaning to keep
def generate_time_series(start=None, end=None, period=None):
    """
    Build pandas date ranges between @start and @end.
    @param start: first instant to consider; defaults to 2009-10-02 00:00.
    @param end: last instant to consider; defaults to the current time.
    @param period: one of y/m/d/h to get only that series.
    @return: a single DatetimeIndex when @period is one of y/m/d/h, otherwise a dict with the
             'y', 'm' and 'd' series.
    """
    start = start or datetime(year=2009, month=10, day=2, hour=0)
    end = end or datetime.now()
    if period in ('y', 'm', 'd', 'h'):
        # Build only the requested series; 'h' is available here although it is left out of the dict.
        return pd.date_range(start=start, end=end, freq=_PERIOD_FREQUENCIES[period])
    # TODO 'YS' / 'MS' yield period *starts* on or after @start, so the year/month that contains
    #  @start is not represented unless @start itself falls on a boundary.
    # TODO add the hourly series to the dict after the dev phase is over (it should only be run ONCE).
    return {
        key: pd.date_range(start=start, end=end, freq=_PERIOD_FREQUENCIES[key])
        for key in ('y', 'm', 'd')
    }


def subject_checker(func):
    """
    Decorator validating the `subject` keyword argument of @func.
    Only keyword arguments are inspected; when `subject` is omitted the default of @func applies.
    @param func: function to wrap.
    @return: the wrapping function.
    @raise ValueError: if `subject` is passed and is not one of all/me/partner.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if 'subject' in kwargs and kwargs['subject'] not in _SUBJECTS:
            raise ValueError('Parameter `subject` should be one of {all, me, partner}')
        return func(*args, **kwargs)

    return wrapper


def _coerce_date(value, name):
    """Parse the string @value into a datetime; a blank string counts as 'not given' (None)."""
    if not value.strip():
        return None
    if dateparser is None:
        raise ImportError('The `dateparser` package is required to pass dates as strings')
    parsed = dateparser.parse(value)
    if parsed is None:
        # Without this check a typo would silently fall back to the default date range.
        raise ValueError(f'Parameter `{name}` could not be parsed as a date: {value!r}')
    return parsed


def date_checker(func):
    """
    Decorator normalizing the `start` / `end` keyword arguments of @func.
    String values are parsed with dateparser. When neither is given, `start` is set to 2006-01-01
    and `end` to the current time. Only keyword arguments are inspected.
    @param func: function to wrap.
    @return: the wrapping function.
    @raise ValueError: if a date string cannot be parsed, or if fewer than two of
                       start/end/period are available.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for name in ('start', 'end'):
            if isinstance(kwargs.get(name), str):
                kwargs[name] = _coerce_date(kwargs[name], name)
        if kwargs.get('start') is None and kwargs.get('end') is None:
            kwargs['start'] = datetime(year=2006, month=1, day=1)  # lower bound used when no range is given
            kwargs['end'] = datetime.now()
        if sum(map(bool, [kwargs.get('start'), kwargs.get('end'), kwargs.get('period')])) < 2:
            raise ValueError(
                'At least two of the following three input variables has to be passed: {start, end, period}')
        return func(*args, **kwargs)

    return wrapper


def period_checker(func):
    """
    Decorator converting the `period` keyword argument of @func from a unit code into a time delta.
    When both `start` and `end` are given the arguments are forwarded untouched, otherwise `period`
    (one of y/m/d/h) is replaced by the matching relativedelta/timedelta.
    Only keyword arguments are inspected.
    @param func: function to wrap.
    @return: the wrapping function.
    @raise ValueError: if `period` is needed but missing or not one of y/m/d/h.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if kwargs.get('start') is not None and kwargs.get('end') is not None:
            return func(*args, **kwargs)
        # .get() so that an unknown code ends in the ValueError below rather than a KeyError.
        delta = _PERIOD_DELTAS.get(kwargs.get('period')) if kwargs.get('period') else None
        if delta is None:
            raise ValueError('Parameter `period` should be one of {y, m, d, h}')
        kwargs['period'] = delta
        return func(*args, **kwargs)

    return wrapper