diff --git a/.gitignore b/.gitignore index 5b8858a..4755795 100644 --- a/.gitignore +++ b/.gitignore @@ -228,24 +228,19 @@ dmypy.json # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebook,pycharm,visualstudiocode +# ignoring data folder +data -# ignoring data -data +# ignoring jupyter notebook +tests/playground.py -# ignoring todo +# ignoring various files created during development +plots +*.png todo.md - - -# ignoring trash file trash.py - - -# ignoring jupyter notebook explore.ipynb - - -# ignoring jupyter notebook -tests/playground.py +tests/test_data/messages/inbox/private_messages.json tests/.pytest_cache .pytest_cache \ No newline at end of file diff --git a/ConversationAnalyzer.py b/ConversationAnalyzer.py deleted file mode 100644 index cfe1a95..0000000 --- a/ConversationAnalyzer.py +++ /dev/null @@ -1,152 +0,0 @@ -import pandas as pd -from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals - - -class ConversationAnalyzer: - def __new__(cls, name, messages, *args, **kwargs): - if messages is None: # This deals with the case if no messages - return None - return super(ConversationAnalyzer, cls).__new__(cls, *args, **kwargs) - - def __init__(self, name, messages): - self.name = name - self.df = messages - - def __str__(self): - return f'{self.name}: {list(self.df.index)}' - - @property - def stats(self): - return self.get_stats(self.df) - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) - stats = ConversationStats(df) - return stats - - def get_time_series_data(self, subject='all', **kwargs): - time_series = generate_time_series(**kwargs) - return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) - - @staticmethod - def get_plottable_time_series_data(interval_stats, statistic): - for k, v in interval_stats.items(): - if isinstance(v, 
ConversationStats): - interval_stats[k] = getattr(v, statistic) - return interval_stats - - @staticmethod - @subject_checker - @date_checker - @period_checker - def filter_by_input(df, subject='all', start=None, end=None, period=None): - if subject == 'me': - df = df[df.sender_name == 'Levente Csőke'] - elif subject == 'partner': - df = df[df.sender_name != 'Levente Csőke'] - if start and end: - df = df.loc[start:end] - elif start and not end: - df = df.loc[start:start + period] - elif not start and end: - df = df.loc[end - period:end] - return df - - -class ConversationStats: - """ - Statistics of conversation with one person. - """ - - # TODO do we need this or not?!?! smh - # def __new__(cls, df, *args, **kwargs): - # if not len(df.index): # This deals with the case if input df is empty - # return None - # return super(ConversationStats, cls).__new__(cls, *args, **kwargs) - - def __init__(self, df): - self.df = df - - def __repr__(self): - return f'{self.msg_count}' - - @property - def messages(self): - return self.df.content.dropna() - - @property - def words(self): - return self.get_words() - - # 1. - @property - def msg_count(self): - return len(self.df) - - # 2. - @property - def unique_msg_count(self): - return len(self.messages.unique()) - - # 3. - @property - def most_used_msgs(self): - # TODO LATER first few (1-10) messages - return self.messages.value_counts() - - # 4. - @property - def msg_frequency(self): - # NOTE this has been most likely depracated OR? - pass - - # 5. - @property - def word_count(self): - return len(self.words) - - # 6. - @property - def unique_word_count(self): - return len(set(self.words)) - - # 7. - @property - def most_used_words(self): - s = pd.Series(self.words) - return s.value_counts() - - # 8. - @property - def word_frequency(self): - pass - - # 9. - @property - def char_count(self): - char_count = 0 - for word in self.words: - char_count += len(word) - return char_count - - # 10. 
- @property - def most_used_chars(self): - return None # TODO LATER or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string - - # 11. - @property - def rate_of_media_messages(self): - pass # NOTE what? - - def get_words(self): - token_list = self.messages.str.lower().str.split() - words = [] - for tokens in token_list: - # print(tokens) - if not isinstance(tokens, list): - print('WARNING! Not a list!') - continue # TODO ??? check this - for token in tokens: - words.append(token) - return words diff --git a/Conversations.py b/Conversations.py deleted file mode 100644 index 3fb1fbd..0000000 --- a/Conversations.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -from FacebookData import FacebookData -import pandas as pd - -from datetime import datetime - -MESSAGE_SUBPATH = 'messages/inbox' - - -class Conversations: - def __init__(self, data_path): - self.data_path = f'{data_path}/{MESSAGE_SUBPATH}' - - def get_people(self): - json_paths = self.walk_directory_and_search('.json') - return self.extract_names_from_convos(json_paths) - - def walk_directory_and_search(self, extension): - paths = [] - for root, dirs, files in os.walk(self.data_path): - for name in files: - if name.endswith(extension): - paths.append(os.path.join(root, name)) - return paths - - # TODO simplify this function!! also this takes very long - @staticmethod - def extract_names_from_convos(jsons): - name_data_map = {} - count = 0 - for file in jsons: - msg = Messages(file) - for participant in msg.participants: - key = participant if msg.ttype == 'Regular' else f'group_{count}' - if key == 'Facebook User': # TODO ?? what to do with this?? 
- continue - if name_data_map.get(key) and key.startswith( - 'group'): # making sure run only once even if it is a group - continue - if name_data_map.get(key): - dfs = [name_data_map[key]['messages'], msg.df] - name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index() - else: - name_data_map[key] = { - 'title': msg.title, - 'compact_name': msg.compact_names, - # 'participants': msg.participants + ['Levente Csőke'], - 'participants': msg.participants, - 'messages': msg.df, - 'friend': None, - 'messages_dir': msg.messages_dir, - 'media_dir': msg.media_dir - } - if msg.ttype == 'RegularGroup': - count += 1 - - return name_data_map - - -class Messages(FacebookData): - def __init__(self, json_path): - super().__init__(json_path) - self.to_df() - self.set_date_as_index() - - def to_df(self): - self._df = pd.DataFrame(self.decoded.get('messages')) - - def set_date_as_index(self): - # NOTE maybe not needed; could calculate real time - date_series = self._df.timestamp_ms.apply(self.ts_to_date) - self._df = self._df.set_index(date_series).iloc[::-1] - - @property - def names(self): - return pd.DataFrame(self.participants)[0] - - @property - def participants(self): - participants = self.decoded.get('participants') - # TODO I should be IN - # but this breaks stuff at TestMessagingAnalyzer - return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] - # return [p.get('name') for p in participants if p.get('name')] - - @property - def title(self): - return self.decoded.get('title') - - @property - def ttype(self): - return self.decoded.get('thread_type') - - @property - def messages_dir(self): - thread_path = self.decoded.get('thread_path') - if not thread_path.startswith('inbox/'): - raise ValueError('Something is not okay.') - # TODO here or in the upper function where we extract names - return thread_path.split('/')[1].lower() - - @property - def media_dir(self): - # todo what should the path contain - for media in ['photos', 
'gifs', 'files', 'videos', 'audio']: - if media in self._df.columns: - media_in_msg = list(self._df[media][self._df[media].notnull()]) - # if len(media_in_msg) > 1: # TODO is this ok. i think it is. think multiple photos sent once - # print('Media in msg is bigger than 1') - uri = media_in_msg[0][0].get('uri') - return os.path.dirname(os.path.dirname(uri)) - return None - - @staticmethod - def ts_to_date(date): - return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/FacebookData.py b/FacebookData.py deleted file mode 100644 index a82c896..0000000 --- a/FacebookData.py +++ /dev/null @@ -1,40 +0,0 @@ -from utils import read_json, decode_text, accents_map - - -class FacebookData: - def __init__(self, json_path): - self.json_path = json_path - self._df = None - - @property - def df(self): - return self._df - - @property - def decoded(self): - return decode_text(self.json) - - @property - def json(self): - return read_json(self.json_path) - - @property - def compact_names(self): - # NOTE this is the place where we change pd/np to builtin - # do we have to do this? 
- name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names))) - return name_list[0] if len(name_list) == 1 else name_list - - @staticmethod - def lower_names(col): - return col.str.lower() - - @staticmethod - def without_accent_and_whitespace(col): - def replace_accents(text): - for char in accents_map.keys(): - if char in text: - text = text.replace(char, accents_map[char]) - return text.replace(' ', '') - - return col.apply(replace_accents) diff --git a/Friends.py b/Friends.py deleted file mode 100644 index 6e0e991..0000000 --- a/Friends.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd -import os -from FacebookData import FacebookData -from utils import accents_map - - -class Friends(FacebookData): - - def __init__(self, *args): - super().__init__(*args) - - # self.path = 'data/friends' - # self.json_path = f'{self.path}/friends.json' - - self.to_df() - - def get_people(self): - names = {} - for name, compact in zip(self.names, self.compact_names): - names[name] = { - 'title': name, - 'compact_name': compact, - 'messages': None, - 'friend': True, - 'participants': None, - 'messages_dir': None, - 'media_dir': None - } - return names - - def to_df(self): - self._df = pd.DataFrame(self.decoded.get('friends')) - - @property - def names(self): - return self.df.name diff --git a/Individual.py b/Individual.py deleted file mode 100644 index a9f8d03..0000000 --- a/Individual.py +++ /dev/null @@ -1,47 +0,0 @@ -class Individual: - def __init__(self, name=None, title=None,compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, - member_of=None): - self._name = name - self._title = title - self._compact_name = compact - self._messages = messages - self._friend = friend - self._messages_dir = messages_dir - self._media_dir = media_dir - self._member_of = member_of - - - def __repr__(self): - return self.name - - @property - def name(self): - return self._name - - @property - def title(self): - return self._title - - @property - 
def messages(self): - return self._messages - - @property - def friend(self): - return self._friend - - @property - def media_dir(self): - return self._media_dir - - @property - def messages_dir(self): - return self._messages_dir - - @property - def compact_name(self): - return self._compact_name - - @property - def member_of(self): - return self._member_of diff --git a/Me.py b/Me.py deleted file mode 100644 index 3293bf7..0000000 --- a/Me.py +++ /dev/null @@ -1,10 +0,0 @@ -from FacebookData import FacebookData - - -class Me(FacebookData): - def __init__(self, *args): - super().__init__(*args) - - @property - def name(self): - return 'Levente Csőke' diff --git a/MessagingAnalyzer.py b/MessagingAnalyzer.py deleted file mode 100644 index 0619505..0000000 --- a/MessagingAnalyzer.py +++ /dev/null @@ -1,130 +0,0 @@ -from utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals -from datetime import datetime, date, timedelta -import pandas as pd -from ConversationAnalyzer import ConversationAnalyzer - - -class MessagingAnalyzer: - def __init__(self, names, people): - # TODO input people only. 
class will know what to do - self.names = names - self.people = people - - def time_series_analysis_for_all(self, subject=None, **kwargs): - time_series = generate_date_series(**kwargs) - stacked_df = self.stack_dfs(self.people) - interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - # TODO LATER - # here you have to do something with it - pass - - def get_count(self, attribute, subject='all', start=None, end=None, period=None): - count = 0 - # we have a list of names we want to iterate over - for name in self.names: - stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period) - if stats is not None: - count += getattr(stats, attribute) - return count - - def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None): - messages = self.people.get(name).messages - analyzer = ConversationAnalyzer(name, messages) - if analyzer is None: - return None - return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period) - - def total_number_of_(self, attribute, subject='all', **kwargs): - return self.get_count(attribute=attribute, subject=subject, **kwargs) - - # 1. Ranking of friends by total count of messages/words/characters (also by year/month/day/hour) - def total_number_of_messages(self, **kwargs): - return self.total_number_of_(attribute='msg_count', **kwargs) - - def total_number_of_words(self, **kwargs): - return self.total_number_of_(attribute='word_count', **kwargs) - - def total_number_of_characters(self, **kwargs): - return self.total_number_of_(attribute='char_count', **kwargs) - - # 2. 
Ranking of friends who I sent the most messages/words/characters (also by year/month/day/hour) - def total_number_of_messages_sent(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='me', **kwargs) - - def total_number_of_words_sent(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='me', **kwargs) - - def total_number_of_characters_sent(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='me', **kwargs) - - # 3. Ranking of friends who sent the most messages/words/characters (also by year/month) - def total_number_of_messages_received(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='partner', **kwargs) - - def total_number_of_words_received(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='partner', **kwargs) - - def total_number_of_characters_received(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='partner', **kwargs) - - # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) - def most_used_messages_by_me(self, **kwargs): - """ - >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) - >>> s2 = pd.Series([3, 2, 1, 1]) - >>> s1_vc = s1.value_counts() - >>> s2_vc = s2.value_counts() - TODO LATER most used is already a problem: - - because its a series of all the unique messages/words ever used in a convo - - it contains strings like ':d', ':p' and 'xd' - - from all the convos the result of value_counts has to be cleared - and has to be truncated (that is not use the 200th most used word, only top10 let's say) - - then these series has to be merged in a way that the same string's counts are added up - - what about typos????! - """ - pass - - def most_used_messages_by_partners(self, **kwargs): - pass - - def most_used_words_by_me(self, **kwargs): - pass - - def most_used_words_by_partners(self, **kwargs): - pass - - # 5. 
Number of messages sent/got on busiest period (by year/month/day/hour) - def days_when_most_messages_sent(self): - # TODO LATER hard algorithmic problem - pass - - def days_when_most_messages_received(self): - pass - - def hours_when_most_messages_sent(self): - # TODO LATER - # is this referring to the absolute hour most messages sent?? - # like: 2014.07.25. 15h-16h - # OR - # the pattern of most messages sent between this and this hours - # like: 20h-21h - # ACTUALLY BOTH - # for years/months/days/hours - # BUT this comes from the time series analysis - pass - - def hours_when_most_messages_received(self): - pass - - # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' - # TODO - - @staticmethod - def stack_dfs(people): - dfs = [] - for data in people.values(): - if data.messages is not None: - dfs.append(data.messages) - return pd.concat(dfs).sort_index() diff --git a/Miner.py b/Miner.py deleted file mode 100644 index 3b32806..0000000 --- a/Miner.py +++ /dev/null @@ -1,45 +0,0 @@ -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' - -from People import People -from ConversationAnalyzer import ConversationAnalyzer -from MessagingAnalyzer import MessagingAnalyzer - - -class Miner: - def __init__(self): - pass - - @staticmethod - def analyze_messages(): - p = People(path=DATA_PATH) - - stats = {} - - for name, person in p.individuals.items(): - #assert name == person.name, 'ERRRRRRROR!!!' 
- if person.messages is None: - stats[person.name] = None - continue - analyzer = ConversationAnalyzer(person.name, person.messages) - stats[person.name] = analyzer.stats - # if stats[person.name].get('message_count').get('me') > 5000: - # top[person.name] = stats[person.name] - example = stats['Dániel Nagy'] - print() - - # print('LEN: ', len(top.keys())) - # top_all = {name: data.get('message_count').get('all') for name, data in top.items()} - # analyzer.visualize_stats(top) - - @staticmethod - def analyze_messaging(): - p = People(path=DATA_PATH) - - msg_analyzer = MessagingAnalyzer(p.names, p.individuals) - - msgs = msg_analyzer.total_number_of_messages() - - -if __name__ == '__main__': - m = Miner() - m.analyze_messages() diff --git a/People.py b/People.py deleted file mode 100644 index 11d1887..0000000 --- a/People.py +++ /dev/null @@ -1,79 +0,0 @@ -from Individual import Individual -from Conversations import Conversations -from Friends import Friends - -# from Me import Me -DATA_PATH = '/home/levente/projects/facebook-data-miner/data' -import time -from Group import Group - - -# TODO we dont need both data and individuals... or?? - -class People: - def __init__(self, path=None): - self.data_path = path if path else DATA_PATH - self._names = [] - self._individuals = {} - self._groups = [] - self._data = self.get_people() # TODO is this supposed to be here or elsewhere - self.to_individuals() # TODO is this supposed to be here or elsewhere - - @property - def data(self): - return self._data - - @property - def names(self): - return self._names - - @property - def individuals(self): - return self._individuals - - @property - def groups(self): - return self._groups - - def get_people(self): - start = time.time() - friends = Friends(self.data_path + '/friends/friends.json') - people1 = friends.get_people() - print('friends: ', time.time() - start) - - # TODO LATER too slow - # takes about 30 secs both - # read it once, store it in DB OR? 
- start = time.time() - conversations = Conversations(self.data_path) - people2 = conversations.get_people() - print('convos: ', time.time() - start) - - return self.unify_people(people1, people2) - - def to_individuals(self): # TODO maybe rather split_convos or differentiate_convos - start = time.time() - for person, data in self._data.items(): - if person.startswith('group'): - g = Group(name=data.get('name'), title=data.get('title'), messages=data.get('messages'), - compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), - media_dir=data.get('media_dir'), members=None) - self._groups.append(g) - else: - indie = Individual(name=person, title=data.get('title'), messages=data.get('messages'), - compact=data.get('compact_name'), messages_dir=data.get('messages_dir'), - media_dir=data.get('media_dir'), member_of=None) - self._names.append(person) - self._individuals[person] = indie - print('indies: ', time.time() - start) - - @staticmethod - def unify_people(friends, convos): - for person, data in friends.items(): - if not convos.get(person): - convos[person] = data - convos[person]['friend'] = True - return convos - -# if __name__ == '__main__': -# p = People() diff --git a/README.md b/README.md index 943a123..c4b3b6a 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,10 @@ More info soon... ## Contribution Help is more than welcome. If somebody feel the urge to contribute, I would share my plans with them. -Ideas are welcome too. Feel free to open a new issue. \ No newline at end of file +Ideas are welcome too. Feel free to open a new issue. 
class Analyzer:
    """
    Analyzer for analyzing specific and/or all conversations.

    Wraps a `people` object (anything exposing `.data`, a mapping of
    name -> person with a `.messages` frame, and `.names`) and computes
    `ConversationStats` over the whole message history or slices of it.
    """

    def __init__(self, people):
        self.people = people
        self.people_data = people.data
        self.names = list(people.names)
        # More than one person: work on all conversations stacked together.
        self.multi = len(self.people_data) > 1

        if self.multi:
            self.df = self.stack_dfs(self.people_data)
        else:
            self.df = self.people_data.get(self.names[0]).messages

    def __str__(self):
        # `__str__` must return a str; returning the raw list raised TypeError.
        if self.multi:
            return str(self.names)
        return f'{self.names[0]}: {list(self.df.index)}'

    @property
    def stats(self):
        """Statistics over every message held by this analyzer."""
        return self.get_stats()

    def get_stats_for_intervals(self, time_series, period, subject='all'):
        """
        Return a dict mapping each interval start in `time_series` to the
        stats of the messages between that start and the next one.
        """
        starts = list(time_series)
        # The last interval has no successor: `end=None` leaves it to
        # `filter_by_input` to derive the end from `start` and `period`.
        ends = starts[1:] + [None]
        return {
            start: self.get_stats(df=self.df, subject=subject, start=start, end=end, period=period)
            for start, end in zip(starts, ends)
        }

    def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
        """Filter `df` (default: all messages) and wrap it in `ConversationStats`."""
        df = self.df if df is None else df
        df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
        return ConversationStats(df)

    # 1. Total count of messages/words/characters (also by year/month/day/hour)
    # 2. Total count of messages/words/characters sent (also by year/month/day/hour)
    # 3. Total count of messages/words/characters received (also by year/month)
    def get_count(self, attribute, subject='all', start=None, end=None, period=None):
        """Return the `ConversationStats` attribute named `attribute` for the filtered messages."""
        stats = self.get_stats(subject=subject, start=start, end=end, period=period)
        return getattr(stats, attribute)

    # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour)
    def most_used_messages_(self, **kwargs):
        """
        >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1])
        >>> s2 = pd.Series([3, 2, 1, 1])
        >>> s1_vc = s1.value_counts()
        >>> s2_vc = s2.value_counts()
        TODO LATER most used is already a problem:
          - because its a series of all the unique messages/words ever used in a convo
          - it contains strings like ':d', ':p' and 'xd'
          - from all the convos the result of value_counts has to be cleared
          and has to be truncated (that is not use the 200th most used word, only top10 let's say)
          - then these series has to be merged in a way that the same string's counts are added up
          - what about typos????!
        """
        pass

    # 5. Number of messages sent/got on busiest period (by year/month/day/hour)
    def stat_per_period(self, period, statistic, **kwargs):
        """Aggregate `statistic` per `period` via `utils.count_stat_for_period`."""
        interval_stats = self.get_time_series_data(period, **kwargs)
        time_series_data = self.get_stat_count(interval_stats, statistic=statistic)
        return utils.count_stat_for_period(time_series_data, period)

    # 6. Time series: dict of 'y/m/d/h : number of messages/words/characters (also sent/got) for user/all convos'
    def get_time_series_data(self, period, subject='all', **kwargs):
        """Build a date series with `utils.generate_date_series` and compute stats per interval."""
        time_series = utils.generate_date_series(period=period, **kwargs)
        return self.get_stats_for_intervals(time_series, period, subject=subject)

    # 7. Ranking of partners by messages by y/m/d/h, by different stats, by sent/got
    def get_ranking_of_partners_by_messages(self, statistic='msg_count', **kwargs):
        """
        Collect `statistic` for every name in `self.names`.

        `attribute` is accepted as an alias of `statistic` (the keyword used by
        `get_count`); previously it was forwarded to `get_stats` and raised
        TypeError. Remaining keyword arguments are passed on to `get_stats`.
        """
        statistic = kwargs.pop('attribute', statistic)
        count_dict = {}
        for name in self.names:
            partner_df = self.df[self.df.partner == name]
            stats = self.get_stats(df=partner_df, **kwargs)
            count_dict = utils.fill_dict(count_dict, name, getattr(stats, statistic))
        return count_dict

    @staticmethod
    def stack_dfs(people_data):
        """Concatenate the message frames of everyone who has messages, sorted by index."""
        dfs = [data.messages for data in people_data.values() if data.messages is not None]
        return pd.concat(dfs).sort_index()

    @staticmethod
    @utils.attribute_checker
    def get_stat_count(interval_stats, statistic='msg_count'):
        """Map every interval to its `statistic` value without mutating the input dict."""
        return {key: getattr(stats, statistic) for key, stats in interval_stats.items()}

    @staticmethod
    @utils.subject_checker
    @utils.date_checker
    @utils.start_end_period_checker
    def filter_by_input(df, subject='all', start=None, end=None, period=None):
        """
        Restrict `df` by sender and by date range.

        NOTE(review): the three `utils` decorators presumably validate and
        normalise `subject`, `start`, `end` and `period` - confirm in utils.
        """
        if subject == 'me':
            df = df[df.sender_name == 'Levente Csőke']
        elif subject == 'partner':
            df = df[df.sender_name != 'Levente Csőke']
        if start and end:
            df = df.loc[start:end]
        elif start and not end:
            # Open-ended on the right: one `period` forward from `start`.
            df = df.loc[start:start + period]
        elif not start and end:
            # Open-ended on the left: one `period` back from `end`.
            df = df.loc[end - period:end]
        return df
class ConversationStats:
    """
    Statistics of conversation with one person.

    Wraps a messages DataFrame; text statistics are computed from its
    `content` column, message counts from the number of rows.
    """

    def __init__(self, df):
        self.df = df

    def __repr__(self):
        return f'{self.msg_count}'

    @property
    def messages(self):
        """Non-null message texts as a Series."""
        if 'content' not in self.df.columns:
            # A frame without a `content` column yields an empty Series
            # instead of raising AttributeError.
            import pandas as pd
            return pd.Series([], dtype=object)
        return self.df.content.dropna()

    @property
    def words(self):
        """All lower-cased, whitespace-separated tokens of every message."""
        return self.get_words()

    # 1.
    @property
    def msg_count(self):
        return len(self.df)

    # 2.
    @property
    def unique_msg_count(self):
        return len(self.messages.unique())

    # 3.
    @property
    def most_used_msgs(self):
        return self.messages.value_counts()

    # 4.
    @property
    def msg_frequency(self):
        # NOTE this has most likely been deprecated
        pass

    # 5.
    @property
    def word_count(self):
        return len(self.words)

    # 6.
    @property
    def unique_word_count(self):
        return len(set(self.words))

    # 7.
    @property
    def most_used_words(self):
        # Local import: this module has no top-level pandas import, so the
        # bare `pd` reference used to raise NameError.
        import pandas as pd
        return pd.Series(self.words).value_counts()

    # 8.
    @property
    def word_frequency(self):
        pass

    # 9.
    @property
    def char_count(self):
        """Total number of characters over all words (whitespace excluded)."""
        return sum(len(word) for word in self.words)

    # 10.
    @property
    def rate_of_media_messages(self):
        """
        TODO LATER
        search for media messages all 5 of them
        rate is only the second or third abstraction
        """
        pass

    def get_words(self):
        """Return a flat list of lower-cased tokens from all messages."""
        messages = self.messages
        if messages.empty:
            # The `.str` accessor raises on an empty non-string Series
            # (e.g. an all-NaN column after dropna).
            return []
        words = []
        for tokens in messages.str.lower().str.split():
            if not isinstance(tokens, list):
                print('WARNING! Not a list!')
                continue
            words.extend(tokens)
        return words
class Conversations:
    """
    Class for managing and parsing conversations.

    Maps conversation titles to the JSON files that hold their messages and
    builds `Individual` objects from the private conversations.
    """

    def __init__(self, data_path):
        self.private_convo_paths = {}
        self.group_convo_paths = {}  # TODO LATER fill this as well
        self.deleted_user_convo_paths = []  # NOTE these are collected but not yet used

        self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}'
        self.order_paths()

    def order_paths(self):
        """Load the cached title -> files map if present, otherwise build and store it."""
        paths_map = f'{self.data_path}/private_messages.json'
        if os.path.isfile(paths_map):
            self.read_paths(paths_map)
            return
        json_paths = utils.walk_directory_and_search(self.data_path, '.json', contains_string='message_')
        self.differentiate_paths(json_paths)
        self.register_paths()

    def differentiate_paths(self, jsons):
        """Sort message files into deleted-user, group and private conversations."""
        for file in jsons:
            msg = Messages(file)
            if msg.title == 'Facebook User':
                self.deleted_user_convo_paths.append(msg.messages_dir)
            elif msg.ttype == 'RegularGroup':
                self.map_group_convo_files(msg, file)
            elif msg.ttype == 'Regular':
                self.map_private_convo_files(msg, file)
            else:
                raise ValueError('Should not happen!')

    def register_paths(self):
        """Persist the private conversation map next to the message folders."""
        utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json')

    def read_paths(self, file):
        """Read a previously stored private conversation map."""
        self.private_convo_paths = utils.read_json(file)

    def map_private_convo_files(self, msg, file):
        """Record `file` under the title of the private conversation `msg`."""
        self.private_convo_paths.setdefault(msg.title, []).append(file)

    def map_group_convo_files(self, msg, file):
        """Record every participant of `msg` except the user under `file`."""
        for participant in msg.participants:
            if participant == 'Levente Csőke':
                continue
            self.group_convo_paths.setdefault(file, []).append(participant)

    def get_people_from_private_messages(self, name=None, membership=True):
        """
        Build a title -> `Individual` map from the private conversations.

        `name` (a string or a list of strings) restricts the result to those
        conversations; `membership` is forwarded to `create_individual`.
        """
        if name is not None:
            convo_path_map = self.filter_by_name(name)
        else:
            convo_path_map = self.private_convo_paths.values()

        name_data_map = {}
        for paths in convo_path_map:
            for file in paths:
                messages = Messages(file)
                # A distinct local name keeps the `name` filter argument intact.
                title = messages.title
                if name_data_map.get(title) is not None:
                    # Conversation split over several files: merge the frames.
                    dfs = [name_data_map[title].messages, messages.df]
                    name_data_map[title].messages = pd.concat(dfs).sort_index()
                else:
                    name_data_map[title] = self.create_individual(messages, membership=membership)
        return name_data_map

    def filter_by_name(self, name):
        """
        Return the file lists of the requested conversation(s).

        Names without a private conversation are skipped; previously they
        produced a `None` entry that made the caller's iteration fail.
        """
        if isinstance(name, str):
            names = [name]
        elif isinstance(name, (list, tuple)):
            names = name
        else:
            names = []
        return [self.private_convo_paths[n] for n in names if n in self.private_convo_paths]

    def create_individual(self, messages, membership=None):
        """Create an `Individual` from a `Messages` object."""
        return Individual(
            name=messages.title,
            compact=messages.compact_names,
            messages=messages.df,
            messages_dir=messages.messages_dir,
            media_dir=messages.media_dir,
            member_of=self.group_membership(messages.title) if membership else None,
        )

    @staticmethod
    def group_membership(name):
        # Placeholder: group membership lookup is not implemented yet.
        return None

    def get_people_from_group_messages(self):
        pass
class Friends(FacebookData):
    """
    Class for storing data in friends.json
    """

    def __init__(self, *args):
        super().__init__(*args)
        self.to_df('friends')

    def get_people(self, name=None):
        """
        Return a dict mapping each friend's full name to an `Individual`.

        If `name` is given, only the friend with exactly that name is returned.
        """
        compact_names = self.compact_names
        if isinstance(compact_names, str):
            # `compact_names` collapses a one-element list into a bare string;
            # zipping over that string would pair the friend with a single
            # character instead of the whole compact name.
            compact_names = [compact_names]

        people = {}
        for full_name, compact in zip(self.names, compact_names):
            if name is not None and name != full_name:  # filtering for name
                continue
            people[full_name] = Individual(
                name=full_name,
                compact=compact,
                friend=True,
            )
        return people

    @property
    def names(self):
        """The `name` column of the friends table."""
        return self.df.name
class Group: + """ + Class for holding a group-message's data + """ + def __init__(self, name=None, title=None, messages=None, compact=None, messages_dir=None, media_dir=None, members=None): self._name = name @@ -26,10 +30,6 @@ def title(self): def messages(self): return self._messages - # @property - # def get_message_jsons(self): - # return self._messages - @property def media_dir(self): return self._media_dir diff --git a/miner/Individual.py b/miner/Individual.py new file mode 100644 index 0000000..6f818ab --- /dev/null +++ b/miner/Individual.py @@ -0,0 +1,61 @@ +class Individual: + """ + Class for holding a person's data the user ever interacted with + """ + + def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None, + media_dir=None, + member_of=None): + self._name = name + self._compact_name = compact + self._messages = messages + self._friend = friend + self._messages_dir = messages_dir + self._media_dir = media_dir + self._member_of = member_of + + def __repr__(self): + return f'{self.name}, messages: {self.messages}' + + def __add__(self, other): + return Individual( + name=self.name if self.name else other.name, + friend=self.friend if self.friend else other.friend, + compact=self.compact_name if self.compact_name else other.compact_name, + messages=self.messages if len(self.messages) else other.messages, + messages_dir=self.messages_dir if self.messages_dir else other.messages_dir, + media_dir=self.media_dir if self.media_dir else other.media_dir, + member_of=self.member_of if self.member_of else other.member_of + ) + + @property + def name(self): + return self._name + + @property + def messages(self): + return self._messages + + @messages.setter + def messages(self, df): + self._messages = df + + @property + def friend(self): + return self._friend + + @property + def media_dir(self): + return self._media_dir + + @property + def messages_dir(self): + return self._messages_dir + + @property + def compact_name(self): + 
return self._compact_name + + @property + def member_of(self): + return self._member_of diff --git a/miner/Me.py b/miner/Me.py new file mode 100644 index 0000000..ef2179e --- /dev/null +++ b/miner/Me.py @@ -0,0 +1,14 @@ +from miner.FacebookData import FacebookData + + +class Me(FacebookData): + """ + Class for storing basic data about the user + """ + + def __init__(self, *args): + super().__init__(*args) + + @property + def name(self): + return '' diff --git a/miner/Messages.py b/miner/Messages.py new file mode 100644 index 0000000..ebdaadf --- /dev/null +++ b/miner/Messages.py @@ -0,0 +1,64 @@ +from datetime import datetime +import pandas as pd +import os + +from miner.FacebookData import FacebookData +from miner import utils + + +class Messages(FacebookData): + """ + Class for representing data of all the messages with a user or a group + """ + + def __init__(self, json_path): + super().__init__(json_path) + self.to_df('messages') + self.set_date_as_index() + self.add_partner_column() + + @property + def names(self): + try: + return pd.DataFrame(self.participants)[0] + except KeyError: + return pd.Series({0: 'Facebook User'}) + + @property + def participants(self): + participants = self.decoded.get('participants') + return [p.get('name') for p in participants if p.get('name')] + + @property + def title(self): + return self.decoded.get('title') + + @property + def ttype(self): + return self.decoded.get('thread_type') + + @property + def messages_dir(self): + thread_path = self.decoded.get('thread_path') + if not thread_path.startswith('inbox/'): + raise ValueError('Field `thread_path` should start with `inbox/`.') + return thread_path.split('inbox/')[1] + + @property + def media_dir(self): + for media in utils.MEDIA_DIRS: + if media in self._df.columns: + media_in_msg = list(self._df[media][self._df[media].notnull()]) + uri = media_in_msg[0][0].get('uri') + return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] + + def set_date_as_index(self): + 
date_series = self._df.timestamp_ms.apply(self.ts_to_date) + self._df = self._df.set_index(date_series).iloc[::-1] + + def add_partner_column(self): + self._df['partner'] = self.title + + @staticmethod + def ts_to_date(date): + return datetime.fromtimestamp(date / 1000) diff --git a/miner/People.py b/miner/People.py new file mode 100644 index 0000000..b7852e8 --- /dev/null +++ b/miner/People.py @@ -0,0 +1,56 @@ +import time +import os + +from miner.Conversations import Conversations +from miner.Friends import Friends + +DATA_PATH = f'{os.getcwd()}/data' + + +class People: + """ + Class that manages and represents people from different kind of interactions + # TODO LATER abstractional flaw?! people? person? indie? + """ + + def __init__(self, path=None, name=None): + self.data_path = path if path else DATA_PATH + self._groups = [] + self._data = self.get_people(name=name) + self._names = self.data.keys() + + @property + def data(self): + return self._data + + @property + def names(self): + return self._names # if len(self._names) > 1 else self._names[0] + + @property + def groups(self): + return self._groups + + def get_people(self, name=None): + start = time.time() + friend = Friends(self.data_path + '/friends/friends.json') + friends = friend.get_people(name=name) + print('friends: ', time.time() - start) + + start = time.time() + conversations = Conversations(self.data_path) + print('convos1: ', time.time() - start) + start = time.time() + individuals = conversations.get_people_from_private_messages(name=name) + print('convos2: ', time.time() - start) + + return self.unify_people(friends, individuals) + + @staticmethod + def unify_people(friends, convo_partners): + for person, friend in friends.items(): + if not convo_partners.get(person): + convo_partners[person] = friend + else: + convo_partners[person] = convo_partners.get(person) + friend + return convo_partners diff --git a/Visualizer.py b/miner/Visualizer.py similarity index 88% rename from Visualizer.py 
rename to miner/Visualizer.py index 052ecb3..440c3e7 100644 --- a/Visualizer.py +++ b/miner/Visualizer.py @@ -1,8 +1,8 @@ import matplotlib.pyplot as plt import seaborn as sns import pandas as pd -from People import People -from ConversationAnalyzer import ConversationAnalyzer +from miner.People import People +from miner.ConversationAnalyzer import ConversationAnalyzer # plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120}) @@ -22,7 +22,7 @@ def plot_convos(self, names): @staticmethod def set_up_data(people, name, period='y'): - analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages) + analyzer = ConversationAnalyzer(name, people.data.get(name).messages) interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period=period) return analyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count') diff --git a/tests/TestMessages.py b/miner/__init__.py similarity index 100% rename from tests/TestMessages.py rename to miner/__init__.py diff --git a/miner/requirements.txt b/miner/requirements.txt new file mode 100644 index 0000000..8ee3351 --- /dev/null +++ b/miner/requirements.txt @@ -0,0 +1,5 @@ +numpy==1.18.1 +pandas==1.0.3 +dateparser==0.7.6 +matplotlib==3.2.1 +python_dateutil==2.8.1 diff --git a/utils.py b/miner/utils.py similarity index 55% rename from utils.py rename to miner/utils.py index 2a48624..6f6f565 100644 --- a/utils.py +++ b/miner/utils.py @@ -1,28 +1,28 @@ -import json -import pandas as pd -import dateparser -from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta +from datetime import datetime, timedelta +import dateparser +from itertools import islice +import json +import os +MESSAGE_SUBPATH = 'messages/inbox' +MEDIA_DIRS = ['photos', 'gifs', 'files', 'videos', 'audio'] MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] - - -def read_json(file): - with open(file) 
as f: - return json.load(f) - - -def dump_to_json(data=None, file=None): - with open(file, 'w') as f: - json.dump(data, f) - - -def order_list_of_dicts(lst, key='timestamp_ms'): - return sorted(lst, key=lambda k: k[key]) - - -accents_map = { +WEEKDAYS = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] +PERIOD_MAP = { + 'y': None, + 'm': MONTHS, + 'd': WEEKDAYS, + 'h': None, +} +DELTA_MAP = { + 'y': relativedelta(years=+1), + 'm': relativedelta(months=+1), + 'd': timedelta(days=1), + 'h': timedelta(hours=1) +} +ACCENTS_MAP = { "á": "a", "é": "e", "í": "i", @@ -32,19 +32,21 @@ def order_list_of_dicts(lst, key='timestamp_ms'): "ú": "u", "ü": "u", "ű": "u", - # "Á": "A", - # "É": "E", - # "Í": "I", - # "Ó": "O", - # "Ö": "O", - # "Ő": "O", - # "Ú": "U", - # "Ü": "U", - # "Ű": "U", } -# +def read_json(file): + with open(file) as f: + return json.load(f) + + +def dump_to_json(data=None, file=None): + with open(file, 'w', encoding='utf8') as f: + json.dump(data, f, ensure_ascii=False) + + +def order_list_of_dicts(lst, key='timestamp_ms'): + return sorted(lst, key=lambda k: k[key]) def year_converter(func): @@ -110,19 +112,10 @@ def wrapper(*args, **kwargs): return wrapper -DELTA_MAP = { - 'y': relativedelta(years=+1), - 'm': relativedelta(months=+1), - 'd': timedelta(days=1), - 'h': timedelta(hours=1) -} - - -def period_checker(func): +def start_end_period_checker(func): def wrapper(*args, **kwargs): if kwargs.get('start') is not None and kwargs.get('end') is not None: return func(*args, **kwargs) - if not kwargs.get('period') or DELTA_MAP[kwargs.get('period')] is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') kwargs['period'] = DELTA_MAP[kwargs.get('period')] @@ -131,33 +124,38 @@ def wrapper(*args, **kwargs): return wrapper -def generate_date_series(start=None, end=None, period=None): - if period is None or DELTA_MAP.get(period) is None: - raise ValueError('Parameter `period` should be one of {y, m, d, h}') - 
start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO change this to date when user joined FB - end = end or datetime.now() +def period_checker(func): + def wrapper(*args, **kwargs): + if not kwargs.get('period') or DELTA_MAP[kwargs.get('period')] is None: + raise ValueError('Parameter `period` should be one of {y, m, d, h}') + return func(*args, **kwargs) + + return wrapper + + +def get_start_based_on_period(join_date, period): + if period == 'y': + return datetime(join_date.year, 1, 1) + elif period == 'm': + return datetime(join_date.year, join_date.month, 1) + return join_date + +@period_checker +def generate_date_series(period='y', start=None, end=None): dates = [] + + join_date = datetime(year=2009, month=10, day=2) # TODO later get this from somewhere + start = start or get_start_based_on_period(join_date, period) + end = end or datetime.now() + intermediate = start - while intermediate <= end: + while intermediate <= end: # means that we want to have the end in it as well dates.append(intermediate) intermediate = intermediate + DELTA_MAP.get(period) return dates -def get_stats_for_intervals(func, df, time_series, subject='all'): - data = {} - for offset, series in time_series.items(): - data[offset] = {} - for i in range(len(series) - 1): # only looping len - 1 times - start = series[i] - # TODO LATER will we miss the last entry? I dont think so (99%), but check and correct hand in hand with the timeseries bug - # IT DOES NOT! 
HOWEVER test it with new data injected/modified at runtime <- this is hard - end = series[i + 1] - data[offset][start] = func(df, subject=subject, start=start, end=end) - return data - - def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0): return datetime(year=year, month=month, day=day, hour=hour) @@ -186,3 +184,82 @@ def decode_text(obj): return {key: decode_text(item) for key, item in obj.items()} return obj + + +def lower_names(col): + return col.str.lower() + + +def replace_accents(text): + for char in ACCENTS_MAP.keys(): + if char in text: + text = text.replace(char, ACCENTS_MAP[char]) + return text.replace(' ', '') + + +def without_accent_and_whitespace(col): + return col.apply(replace_accents) + + +def walk_directory_and_search(path, extension, contains_string=None): + paths = [] + for root, dirs, files in os.walk(path): + for file_name in files: + if file_name.endswith(extension): + if contains_string is not None and contains_string in file_name: + paths.append(os.path.join(root, file_name)) + return paths + + +def fill_dict(dictionary, key, value): + if dictionary.get(key) is not None: + dictionary[key] += value + else: + dictionary[key] = value + return dictionary + + +def month_sorter(x): + return MONTHS.index(x[0]) + + +def count_stat_for_period(data, period): + # TODO sort by lists + periods = {} + for key, value in data.items(): + if period == 'y': + periods = fill_dict(periods, key.year, value) + periods = dict(sorted(periods.items())) + elif period == 'm': + periods = fill_dict(periods, MONTHS[key.month - 1], value) + periods = dict(sorted(periods.items(), key=lambda x: MONTHS.index(x[0]))) + elif period == 'd': + periods = fill_dict(periods, WEEKDAYS[key.weekday()], value) + periods = dict(sorted(periods.items(), key=lambda x: WEEKDAYS.index(x[0]))) + elif period == 'h': + periods = fill_dict(periods, key.hour, value) + periods = dict(sorted(periods.items())) + return periods + + +def sort_dict(dictionary, func=lambda x: x, 
reverse=False): + return {key: value for key, value in sorted(dictionary.items(), key=func, reverse=reverse)} + + +def remove_items_where_value_is_falsible(dictionary): + return {k: v for k, v in dictionary.items() if v} + + +# keep only first 20 entries +def slice_dict(dictionary, n): + return dict(islice(dictionary.items(), n)) + + +def attribute_checker(func): + def wrapper(*args, **kwargs): + statistic = kwargs.get('statistic') + if not statistic or statistic not in ('msg_count', 'word_count', 'char_count'): + raise ValueError('Parameter `statistic` should be one of {msg_count, word_count, char_count}') + return func(*args, **kwargs) + + return wrapper diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..76f8eef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +seaborn==0.10.1 +dateparser==0.7.6 +pandas==1.0.3 +matplotlib==3.2.1 +python_dateutil==2.8.1 diff --git a/tests/TestConversations.py b/tests/TestConversations.py deleted file mode 100644 index e198dc9..0000000 --- a/tests/TestConversations.py +++ /dev/null @@ -1,71 +0,0 @@ -import pandas as pd -import pytest -from Conversations import Conversations -import os -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' - - -@pytest.fixture() -def convos(): - convo = Conversations(f'{TEST_DATA_PATH}') - return convo.get_people() - - -def test_get_all_people_from_convo(convos): - people = [] - - for convo in convos.keys(): - if convo.startswith('group'): - people += [p for p in convos[convo].get('participants')] - else: - people.append(convo) - people = list(set(people)) - - expected = ['Dér Dénes', 'Facebook User', 'Foo Bar', 'John Doe', 'Teflon Musk', 'Benedek Elek', 'Donald Duck', - 'Tőke Hal'] - # TODO LATER what to do with Facebook User?????? 
- assert sorted(people) == sorted(expected) - - -def test_all_convos_have_dir(convos): - assert all([data.get('messages_dir') for data in convos.values()]) - - -def test_all_convos_have_messages_df(convos): - assert all([isinstance(data.get('messages'), pd.DataFrame) for data in convos.values()]) - - -def test_some_convos_as_media_dir(convos): - assert convos.get('Teflon Musk').get('media_dir') - assert not convos.get('Benedek Elek').get('media_dir') - -def test_convo_media_has_one_folder_of_possibles(convos): - listed_dir = os.listdir(f"{TEST_DATA_PATH}/{convos.get('Teflon Musk').get('media_dir')}") - assert 'files' in listed_dir - assert 'photos' in listed_dir - assert 'audio' not in listed_dir - -def test_groups_have_more_than_two_participates(convos): - groups = {convo: data for convo, data in convos.items() if convo.startswith('group')} - # TODO participants should contain the user itself as well - assert all([len(data.get('participants')) > 2 for data in groups.values()]) - - - -""" -testcases: -- individual convos contain all names, compact_names, message folders and media folders - - media folders are a big question. how do you get it? actually once you have the thread_path then from that you can guess, - OR better off use the uri in the messages... 
fuck seems complicated -- friends contain all names and compact names, -- convos and friends has a common set, and the set is identical -- people gets assigned with all the unique friends and individual/group convos - -gonna test: -- assigning messages to friends, -- deal with multiple directories, IF there are multiple directories, -- -concerns: -- what to do with non-friends, -- I assume multiple directories are because of files sent, -""" diff --git a/tests/TestMessagingAnalyzer.py b/tests/TestMessagingAnalyzer.py deleted file mode 100644 index b803693..0000000 --- a/tests/TestMessagingAnalyzer.py +++ /dev/null @@ -1,243 +0,0 @@ -import pytest -from MessagingAnalyzer import MessagingAnalyzer -from utils import dt - -@pytest.fixture(scope='session') -def analyzer(people): - return MessagingAnalyzer(people.names, people.individuals) - - -def test_total_number_of_messages(analyzer): - assert analyzer.total_number_of_messages() == 29 - - assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014), period='y') == 11 - assert analyzer.total_number_of_messages(start=dt(year=2018), period='y') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2020), period='y') == 15 - - assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=11), period='m') == 8 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=12), period='m') == 2 - - assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=5), period='m') == 0 - - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2), period='m') == 10 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=3), period='m') 
== 1 # jpg - assert analyzer.total_number_of_messages(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=8), period='m') == 1 - - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13), period='d') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 - - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 - - -def test_total_number_of_words(analyzer): - assert analyzer.total_number_of_words() == 86 - - assert analyzer.total_number_of_words(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words(start=dt(year=2014), period='y') == 20 - assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 32 - assert analyzer.total_number_of_words(start=dt(year=2020), period='y') == 34 - - assert analyzer.total_number_of_words(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words(start=dt(year=2014, month=11), period='m') == 13 - assert analyzer.total_number_of_words(start=dt(year=2014, month=12), period='m') == 1 - - assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 32 - assert analyzer.total_number_of_words(start=dt(year=2018, month=2), period='m') == 0 - - assert analyzer.total_number_of_words(start=dt(year=2020, month=2), period='m') == 27 - assert analyzer.total_number_of_words(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2020, month=6), period='m') == 0 - assert 
analyzer.total_number_of_words(start=dt(year=2020, month=8), period='m') == 2 - - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13), period='d') == 14 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 - - -def test_total_number_of_characters(analyzer): - assert analyzer.total_number_of_characters() == 379 - - assert analyzer.total_number_of_characters(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2014), period='y') == 69 - assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2020), period='y') == 140 - - assert analyzer.total_number_of_characters(start=dt(year=2014, month=9), period='m') == 24 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=11), period='m') == 42 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=12), period='m') == 3 - - assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=2), period='m') == 0 - - assert analyzer.total_number_of_characters(start=dt(year=2020, month=2), period='m') == 114 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=4), period='m') == 17 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=5), period='m') == 4 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=8), period='m') == 5 - - -def test_total_number_of_messages_sent(analyzer): - assert analyzer.total_number_of_messages_sent() == 17 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014), period='y') == 6 - assert 
analyzer.total_number_of_messages_sent(start=dt(year=2018), period='y') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020), period='y') == 9 - - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=1), period='m') == 2 - - assert analyzer.total_number_of_messages_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=5), period='m') == 0 - - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2), period='m') == 6 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=8), period='m') == 1 - - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13), period='d') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 - - -def test_total_number_of_words_sent(analyzer): - assert analyzer.total_number_of_words_sent() == 69 - - assert analyzer.total_number_of_words_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2014), period='y') == 16 
- assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2020), period='y') == 22 - - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=11), period='m') == 9 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=12), period='m') == 1 - - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=2), period='m') == 0 - - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2), period='m') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=8), period='m') == 2 - - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13), period='d') == 5 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 - - -def test_total_number_of_characters_sent(analyzer): - assert analyzer.total_number_of_characters_sent() == 311 - - assert analyzer.total_number_of_characters_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014), period='y') == 60 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 167 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020), period='y') == 84 - - assert 
analyzer.total_number_of_characters_sent(start=dt(year=2014, month=9), period='m') == 24 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=11), period='m') == 33 - assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=12), period='m') == 3 - - assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 167 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=2), period='m') == 0 - - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2), period='m') == 62 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=4), period='m') == 17 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=8), period='m') == 5 - - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0 - - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21 - assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 - - -def test_total_number_of_messages_received(analyzer): - assert analyzer.total_number_of_messages_received() == 12 - assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2014), period='y') == 5 - assert analyzer.total_number_of_messages_received(start=dt(year=2018), period='y') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020), 
period='y') == 6 - - assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0 - - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=9), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=12), period='m') == 1 - - assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=5), period='m') == 0 - - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2), period='m') == 4 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=3), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=4), period='m') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=8), period='m') == 0 - - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=13), period='d') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=14), period='d') == 2 - assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=18), period='d') == 1 - - -def test_total_number_of_words_received(analyzer): - assert analyzer.total_number_of_words_received() == 17 - - assert analyzer.total_number_of_words_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2014), period='y') == 4 - assert analyzer.total_number_of_words_received(start=dt(year=2018), period='y') == 1 - assert analyzer.total_number_of_words_received(start=dt(year=2020), period='y') == 12 - - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=9), period='m') == 
0 - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_words_received(start=dt(year=2014, month=12), period='m') == 0 - - assert analyzer.total_number_of_words_received(start=dt(year=2018, month=1), period='m') == 1 - assert analyzer.total_number_of_words_received(start=dt(year=2018, month=2), period='m') == 0 - - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2), period='m') == 11 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=5), period='m') == 1 - - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=13), period='d') == 9 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=14), period='d') == 2 - assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=18), period='d') == 0 - - -def test_total_number_of_characters_received(analyzer): - assert analyzer.total_number_of_characters_received() == 68 - - assert analyzer.total_number_of_characters_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2014), period='y') == 9 - assert analyzer.total_number_of_characters_received(start=dt(year=2018), period='y') == 3 - assert analyzer.total_number_of_characters_received(start=dt(year=2020), period='y') == 56 - - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=9), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=11), period='m') == 9 - assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=12), period='m') == 0 - - assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=2), period='m') == 0 
- - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2), period='m') == 52 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=5), period='m') == 4 - - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=13), period='d') == 30 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=14), period='d') == 22 - assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=18), period='d') == 0 diff --git a/tests/TestPeople.py b/tests/TestPeople.py deleted file mode 100644 index 61295d8..0000000 --- a/tests/TestPeople.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest - - - -@pytest.fixture() -def people_names(): - return ['John Doe', 'Donald Duck', 'Szett Droxler', 'Foo Bar', 'Tőke Hal', 'Dér Dénes', 'Teflon Musk', 'Daisy Duck', - 'Guy Fawkes', 'Benedek Elek'] - - -def test_specific_people_has_or_has_not_got_messages(people): - # TODO LATER parametrize - import pandas as pd - assert isinstance(people.data.get('Benedek Elek').get('messages'), pd.DataFrame) - assert isinstance(people.data.get('Teflon Musk').get('messages'), pd.DataFrame) - assert isinstance(people.data.get('Tőke Hal').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('John Doe').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Szett Droxler').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Daisy Duck').get('messages'), pd.DataFrame) - assert not isinstance(people.data.get('Guy Fawkes').get('messages'), pd.DataFrame) - - -def test_people_name(people, people_names): - people_without_groups = [p for p in people.data.keys() if not p.startswith('group')] - assert sorted(people_names) == sorted(people_without_groups) - - -def test_some_convos_are_with_friends(people): - assert people.data.get('Teflon 
Musk').get('friend') - assert not people.data.get('Benedek Elek').get('friend') - - -def test_specific_people_has_or_has_not_got_media(people): - assert people.data.get('Teflon Musk').get('media_dir') - -#TODO LATER test individuals too \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 430e923..ebdc35e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,14 @@ import pytest -from People import People +import os -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +from miner.People import People + +TEST_DATA_PATH = f'{os.getcwd()}/test_data' @pytest.fixture(scope='session') -def people(): - p = People(path=TEST_DATA_PATH) - return p +def get_people(): + def _get_people(name=None): + return People(path=TEST_DATA_PATH, name=name) + return _get_people + diff --git a/tests/TestConversationAnalyzer.py b/tests/test_ConversationAnalyzer.py similarity index 87% rename from tests/TestConversationAnalyzer.py rename to tests/test_ConversationAnalyzer.py index 015ac19..61ab388 100644 --- a/tests/TestConversationAnalyzer.py +++ b/tests/test_ConversationAnalyzer.py @@ -1,42 +1,33 @@ import pytest -from ConversationAnalyzer import ConversationAnalyzer -from People import People -from utils import dt -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +from miner.Analyzer import Analyzer +from miner.utils import dt -# @pytest.mark.parametrize("test_input,expected", [("3+5", 8), ("2+4", 6), ("6*9", 42)]) -# def test_eval(test_input, expected): -# assert eval(test_input) == expected - -# get\(\'.*\'\)\. 
- - -@pytest.fixture(scope='session') -def person(people): - def _person(name): - return people.individuals[name] - - return _person +# @pytest.fixture(scope='session') +# def person(get_people): +# def _person(name): +# people = get_people(name) +# return people.data[name] +# +# return _person @pytest.fixture(scope='session') -def analyze(person): +def analyze(get_people): def _analyze(name): - individual = person(name) - return ConversationAnalyzer(name, individual.messages) + people = get_people(name) + return Analyzer(people) return _analyze @pytest.fixture(scope='session') -def statistics(person, analyze): +def statistics(analyze): def _stats(name, **kwargs): - individual = person(name) analyzer = analyze(name) if 'subject' in kwargs or 'start' in kwargs or 'end' in kwargs: # and others - return analyzer.get_stats(individual.messages, **kwargs) + return analyzer.get_stats(**kwargs) else: return analyzer.stats @@ -224,11 +215,5 @@ def test_stats_teflon_musk_all_2014_12(statistics): assert stats.char_count == 0 # assert stats.most_used_chars == 0 -class TestConversationAnalyzer: # Foo Bar - pass -def test_time_series_analysis_for_user(analyze): - analyzer = analyze('Teflon Musk') - analyzer.get_time_series_data(subject='all') - assert 1 diff --git a/tests/test_Conversations.py b/tests/test_Conversations.py new file mode 100644 index 0000000..5a7cb58 --- /dev/null +++ b/tests/test_Conversations.py @@ -0,0 +1,75 @@ +import pandas as pd +import pytest +from miner.Conversations import Conversations +from miner.Individual import Individual +from miner import utils +import os + +TEST_DATA_PATH = f'{os.getcwd()}/test_data' + + +@pytest.fixture() +def conversations(): + return Conversations(f'{TEST_DATA_PATH}') + + +@pytest.fixture +def people_from_private_convos(conversations): + return conversations.get_people_from_private_messages() + + +def test_if_paths_are_registered(conversations): + assert len(conversations.private_convo_paths) == 4 + assert 
len(conversations.group_convo_paths) == 3 + assert len(conversations.deleted_user_convo_paths) == 0 + + +def test_get_all_people_from_private_messages(people_from_private_convos): + people = list(people_from_private_convos.keys()) + expected = ['Foo Bar', 'Teflon Musk', 'Benedek Elek', 'Tőke Hal'] + assert sorted(people) == sorted(expected) + + +def test_get_all_people_from_convo(conversations): + people = [] + # indie + people += list(conversations.private_convo_paths.keys()) + # group + people_from_groups = [p for people in conversations.group_convo_paths.values() for p in people] + + people += people_from_groups + + expected = ['Dér Dénes', 'Facebook User', 'Foo Bar', 'John Doe', 'Teflon Musk', 'Benedek Elek', 'Donald Duck', + 'Tőke Hal'] + + assert sorted(list(set(people))) == sorted(expected) + + +def test_people_are_individual_instances(people_from_private_convos): + assert all([isinstance(person, Individual) for person in people_from_private_convos.values()]) + + +def test_all_individual_have_messages_df(people_from_private_convos): + assert all([isinstance(data.messages, pd.DataFrame) for data in people_from_private_convos.values()]) + + +def test_all_individual_have_dir(people_from_private_convos): + assert all([data.messages_dir for data in people_from_private_convos.values()]) + + +def test_some_individual_as_media_dir(people_from_private_convos): + assert people_from_private_convos.get('Teflon Musk').media_dir + assert not people_from_private_convos.get('Benedek Elek').media_dir + + +def test_individual_media_has_one_folder_of_possibles(people_from_private_convos): + listed_dir = os.listdir( + f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{people_from_private_convos.get('Teflon Musk').media_dir}") + assert 'files' in listed_dir + assert 'photos' in listed_dir + assert 'audio' not in listed_dir + + +def test_groups_have_more_than_two_participates(people_from_private_convos): + groups = {convo: data for convo, data in people_from_private_convos.items() if 
convo.startswith('group')} + assert all([len(data.get('participants')) > 2 for data in groups.values()]) diff --git a/tests/TestFriends.py b/tests/test_Friends.py similarity index 85% rename from tests/TestFriends.py rename to tests/test_Friends.py index f336609..c6abfea 100644 --- a/tests/TestFriends.py +++ b/tests/test_Friends.py @@ -1,8 +1,9 @@ import pytest +import os -from Friends import Friends +from miner.Friends import Friends -TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' +TEST_DATA_PATH = f'{os.getcwd()}/test_data' @pytest.fixture() @@ -31,7 +32,7 @@ def test_get_peoples_names_from_friends(friends, expected_friends): def test_get_peoples_compact_name_from_friends(friends, expected_friends): expected_compact_names = [value.get('compact_name') for value in expected_friends.values()] - assert all([p.get('compact_name') in expected_compact_names for p in friends.values()]) + assert all([p.compact_name in expected_compact_names for p in friends.values()]) diff --git a/tests/test_Messages.py b/tests/test_Messages.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_MessagingAnalyzer.py b/tests/test_MessagingAnalyzer.py new file mode 100644 index 0000000..969dc5d --- /dev/null +++ b/tests/test_MessagingAnalyzer.py @@ -0,0 +1,245 @@ +import pytest + +from miner.Analyzer import Analyzer +from miner.utils import dt + +@pytest.fixture(scope='session') +def analyzer(get_people): + people = get_people() + return Analyzer(people) + + +def test_total_number_of_messages(analyzer): + assert analyzer.get_count(attribute='msg_count', ) == 29 + + assert analyzer.get_count(attribute='msg_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014), period='y') == 11 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018), period='y') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020), period='y') == 15 + + assert 
analyzer.get_count(attribute='msg_count', start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=11), period='m') == 8 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=12), period='m') == 2 + + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=5), period='m') == 0 + + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2), period='m') == 10 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=3), period='m') == 1 # jpg + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=8), period='m') == 1 + + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13), period='d') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 + + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 + + +def test_total_number_of_words(analyzer): + assert analyzer.get_count(attribute='word_count', ) == 86 + + assert analyzer.get_count(attribute='word_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014), period='y') == 20 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018), period='y') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020), period='y') == 34 + + 
assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=11), period='m') == 13 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=12), period='m') == 1 + + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=1), period='m') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2), period='m') == 27 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=8), period='m') == 2 + + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13), period='d') == 14 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 + + +def test_total_number_of_characters(analyzer): + assert analyzer.get_count(attribute='char_count', ) == 379 + + assert analyzer.get_count(attribute='char_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014), period='y') == 69 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018), period='y') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020), period='y') == 140 + + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=11), period='m') 
== 42 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=12), period='m') == 3 + + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=1), period='m') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=2), period='m') == 114 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=5), period='m') == 4 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=8), period='m') == 5 + + +def test_total_number_of_messages_sent(analyzer): + assert analyzer.get_count(attribute='msg_count', subject='me', ) == 17 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014), period='y') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018), period='y') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020), period='y') == 9 + + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=1), period='m') == 2 + + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2011, month=11), 
period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=5), period='m') == 0 + + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2), period='m') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=8), period='m') == 1 + + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 + + +def test_total_number_of_words_sent(analyzer): + assert analyzer.get_count(attribute='word_count', subject='me', ) == 69 + + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014), period='y') == 16 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018), period='y') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020), period='y') == 22 + + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=11), period='m') == 9 + assert 
analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 + + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=1), period='m') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2), period='m') == 16 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=8), period='m') == 2 + + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 5 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + + +def test_total_number_of_characters_sent(analyzer): + assert analyzer.get_count(attribute='char_count', subject='me', ) == 311 + + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014), period='y') == 60 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018), period='y') == 167 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020), period='y') == 84 + + assert 
analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=11), period='m') == 33 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=12), period='m') == 3 + + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=1), period='m') == 167 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2), period='m') == 62 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=8), period='m') == 5 + + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0 + + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21 + assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + + +def test_total_number_of_messages_received(analyzer): + assert analyzer.get_count(attribute='msg_count', subject='partner', ) == 12 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2000), 
period='y') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014), period='y') == 5 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018), period='y') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020), period='y') == 6 + + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2011, month=11), period='m') == 0 + + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=12), period='m') == 1 + + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=5), period='m') == 0 + + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=3), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=4), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=8), period='m') == 0 + + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=18), 
period='d') == 1 + + +def test_total_number_of_words_received(analyzer): + assert analyzer.get_count(attribute='word_count', subject='partner', ) == 17 + + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014), period='y') == 4 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018), period='y') == 1 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020), period='y') == 12 + + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0 + + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2), period='m') == 11 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1 + + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 9 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0 + + +def test_total_number_of_characters_received(analyzer): + assert 
analyzer.get_count(attribute='char_count', subject='partner', ) == 68 + + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014), period='y') == 9 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018), period='y') == 3 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020), period='y') == 56 + + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0 + + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2), period='m') == 52 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=5), period='m') == 4 + + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 30 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 22 + assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0 diff --git a/tests/test_People.py b/tests/test_People.py new file mode 100644 index 0000000..e9f270e --- /dev/null +++ b/tests/test_People.py @@ -0,0 +1,39 
@@ +import pytest + + + +@pytest.fixture() +def people_names(): + return ['John Doe', 'Donald Duck', 'Szett Droxler', 'Foo Bar', 'Tőke Hal', 'Dér Dénes', 'Teflon Musk', 'Daisy Duck', + 'Guy Fawkes', 'Benedek Elek'] + +@pytest.fixture +def people(get_people): + return get_people() + +def test_specific_people_has_or_has_not_got_messages(people): + # TODO LATER parametrize + import pandas as pd + assert isinstance(people.data.get('Benedek Elek').messages, pd.DataFrame) + assert isinstance(people.data.get('Teflon Musk').messages, pd.DataFrame) + assert isinstance(people.data.get('Tőke Hal').messages, pd.DataFrame) + assert not isinstance(people.data.get('John Doe').messages, pd.DataFrame) + assert not isinstance(people.data.get('Szett Droxler').messages, pd.DataFrame) + assert not isinstance(people.data.get('Daisy Duck').messages, pd.DataFrame) + assert not isinstance(people.data.get('Guy Fawkes').messages, pd.DataFrame) + + +def test_people_name(people, people_names): + people_without_groups = [p for p in people.data.keys() if not p.startswith('group')] + assert sorted(people_names) == sorted(people_without_groups) + + +def test_some_convos_are_with_friends(people): + assert people.data.get('Teflon Musk').friend + assert not people.data.get('Benedek Elek').friend + + +def test_specific_people_has_or_has_not_got_media(people): + assert people.data.get('Teflon Musk').media_dir + +#TODO LATER test individuals too \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 579569c..3577add 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ import unittest -from utils import * +from miner.utils import * from pathlib import Path import reusables from reusables.cli import * @@ -65,23 +65,24 @@ def test_generate_date_series(): + # TODO resolve start = datetime(2020, 1, 1, 0, 0) end = datetime(2021, 1, 1, 0, 0) - date_range_year = generate_date_series(start, end, 'y') + date_range_year = generate_date_series(period='y', 
start=start, end=end) assert len(date_range_year) == 1 + 1 - date_range_month = generate_date_series(start, end, 'm') + date_range_month = generate_date_series(period='m', start=start, end=end) assert len(date_range_month) == 12 + 1 - date_range_day = generate_date_series(start, end, 'd') + date_range_day = generate_date_series(period='d', start=start, end=end) assert len(date_range_day) == 366 + 1 - date_range_hour = generate_date_series(start, end, 'h') + date_range_hour = generate_date_series(period='h', start=start, end=end) assert len(date_range_hour) == (366 * 24) + 1 for day in date_range_day: assert isinstance(day, datetime) with pytest.raises(ValueError): - faulty_date_range = generate_date_series(start, end, ) + faulty_date_range = generate_date_series(start=start, end=end, )