diff --git a/ConversationAnalyzer.py b/ConversationAnalyzer.py index fe30520..cfe1a95 100644 --- a/ConversationAnalyzer.py +++ b/ConversationAnalyzer.py @@ -19,22 +19,22 @@ def __str__(self): def stats(self): return self.get_stats(self.df) - # TODO has to be tested + def get_stats(self, df, subject='all', start=None, end=None, period=None): + df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) + stats = ConversationStats(df) + return stats + def get_time_series_data(self, subject='all', **kwargs): time_series = generate_time_series(**kwargs) return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) - def get_plotable_time_series_data(self, interval_stats, statistic): + @staticmethod + def get_plottable_time_series_data(interval_stats, statistic): for k, v in interval_stats.items(): if isinstance(v, ConversationStats): interval_stats[k] = getattr(v, statistic) return interval_stats - def get_stats(self, df, subject='all', start=None, end=None, period=None): - df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) - stats = ConversationStats(df) - return stats - @staticmethod @subject_checker @date_checker @@ -91,13 +91,13 @@ def unique_msg_count(self): # 3. @property def most_used_msgs(self): - # TODO first few (1-10) messages + # TODO LATER first few (1-10) messages return self.messages.value_counts() # 4. @property def msg_frequency(self): - # TODO this has been most likely depracated + # NOTE this has been most likely depracated OR? pass # 5. @@ -132,12 +132,12 @@ def char_count(self): # 10. @property def most_used_chars(self): - return None # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string + return None # TODO LATER or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string # 11. @property def rate_of_media_messages(self): - pass # TODO what? + pass # NOTE what? def get_words(self): token_list = self.messages.str.lower().str.split() diff --git a/Conversations.py b/Conversations.py index 112f336..3fb1fbd 100644 --- a/Conversations.py +++ b/Conversations.py @@ -43,7 +43,7 @@ def extract_names_from_convos(jsons): else: name_data_map[key] = { 'title': msg.title, - 'compact_name': msg.compact_names, # TODO is list ok for if length is only 1?? + 'compact_name': msg.compact_names, # 'participants': msg.participants + ['Levente Csőke'], 'participants': msg.participants, 'messages': msg.df, @@ -67,7 +67,7 @@ def to_df(self): self._df = pd.DataFrame(self.decoded.get('messages')) def set_date_as_index(self): - # TODO maybe not needed; could calculate real time + # NOTE maybe not needed; could calculate real time date_series = self._df.timestamp_ms.apply(self.ts_to_date) self._df = self._df.set_index(date_series).iloc[::-1] diff --git a/Group.py b/Group.py index 41b966f..2152d77 100644 --- a/Group.py +++ b/Group.py @@ -1,4 +1,4 @@ -# TODO groups should be searched by looking into jsons unfortunately :( +# TODO LATER groups should be searched by looking into jsons unfortunately :( # because of directory says others # maybe we dont use groups right away? diff --git a/MessagingAnalyzer.py b/MessagingAnalyzer.py index 07965e5..0619505 100644 --- a/MessagingAnalyzer.py +++ b/MessagingAnalyzer.py @@ -1,26 +1,22 @@ -from utils import year_converter, month_converter, generate_time_series, get_stats_for_intervals +from utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals from datetime import datetime, date, timedelta import pandas as pd from ConversationAnalyzer import ConversationAnalyzer -""" - -""" - class MessagingAnalyzer: def __init__(self, names, people): - # TODO input people only. class ill know what to do + # TODO input people only. class will know what to do self.names = names self.people = people def time_series_analysis_for_all(self, subject=None, **kwargs): - time_series = generate_time_series(**kwargs) + time_series = generate_date_series(**kwargs) stacked_df = self.stack_dfs(self.people) interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) def get_stats(self, df, subject='all', start=None, end=None, period=None): - # TODO + # TODO LATER # here you have to do something with it pass @@ -29,14 +25,14 @@ def get_count(self, attribute, subject='all', start=None, end=None, period=None) # we have a list of names we want to iterate over for name in self.names: stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period) - if stats is not None: # TODO too explicit; needed because it is possible that None will be returned, if t got an empty df + if stats is not None: count += getattr(stats, attribute) return count def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None): messages = self.people.get(name).messages analyzer = ConversationAnalyzer(name, messages) - if analyzer is None: # TODO this is too explicit ?! + if analyzer is None: return None return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period) @@ -80,7 +76,7 @@ def most_used_messages_by_me(self, **kwargs): >>> s2 = pd.Series([3, 2, 1, 1]) >>> s1_vc = s1.value_counts() >>> s2_vc = s2.value_counts() - TODO (later) most used is already a problem: + TODO LATER most used is already a problem: - because its a series of all the unique messages/words ever used in a convo - it contains strings like ':d', ':p' and 'xd' - from all the convos the result of value_counts has to be cleared @@ -101,14 +97,14 @@ def most_used_words_by_partners(self, **kwargs): # 5. Number of messages sent/got on busiest period (by year/month/day/hour) def days_when_most_messages_sent(self): - # TODO hard algorithmic problem + # TODO LATER hard algorithmic problem pass def days_when_most_messages_received(self): pass def hours_when_most_messages_sent(self): - # TODO + # TODO LATER # is this referring to the absolute hour most messages sent?? # like: 2014.07.25. 15h-16h # OR @@ -131,5 +127,4 @@ def stack_dfs(people): for data in people.values(): if data.messages is not None: dfs.append(data.messages) - # TODO do I need to sort by index (date)? yes! - return pd.concat(dfs).sort_index() # TODO why ignore_index?? + return pd.concat(dfs).sort_index() diff --git a/Visualizer.py b/Visualizer.py index b4a3779..052ecb3 100644 --- a/Visualizer.py +++ b/Visualizer.py @@ -4,33 +4,37 @@ from People import People from ConversationAnalyzer import ConversationAnalyzer - # plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120}) +TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' + + class Visualizer: def __init__(self): pass - def plot_time_series(self, x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100): + def plot_convos(self, names): + people = People(path=TEST_DATA_PATH) + for name in names: + data = self.set_up_data(people, name, period='d') + df = pd.DataFrame(data.items(), columns=['date', 'value']) + v.plot_time_series(x=df.date, y=df.value, title=name) + + @staticmethod + def set_up_data(people, name, period='y'): + analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages) + interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period=period) + return analyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count') + + @staticmethod + def plot_time_series(x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100): plt.figure(figsize=(16, 5), dpi=dpi) plt.plot(x, y, color='tab:red') plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel) plt.show() -def set_up(people, name, interval='y'): - analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages) - interval_stats = analyzer.get_time_series_data() - stats = interval_stats.get(interval) - return analyzer.get_plotable_time_series_data(stats, statistic='msg_count') - - if __name__ == "__main__": - v = Visualizer() - TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' - people = People(path=TEST_DATA_PATH) names = ['Teflon Musk', 'Tőke Hal'] - for name in names: - data = set_up(people, name, interval='d') - df = pd.DataFrame(data.items(), columns=['date', 'value']) - v.plot_time_series(x=df.date, y=df.value, title=name) + v = Visualizer() + v.plot_convos(names) diff --git a/tests/TestConversationAnalyzer.py b/tests/TestConversationAnalyzer.py index 5fa87d8..015ac19 100644 --- a/tests/TestConversationAnalyzer.py +++ b/tests/TestConversationAnalyzer.py @@ -43,7 +43,7 @@ def _stats(name, **kwargs): return _stats -# TODO extend all functions with all the data +# TODO LATER or not extend all functions with all the data def test_stats_toke_hal_all(statistics): stats = statistics('Tőke Hal') @@ -133,8 +133,8 @@ def test_stats_toke_hal_me_2014_12(statistics): def test_stats_teflon_musk(statistics): stats = statistics('Teflon Musk') assert stats.msg_count == 6 - assert stats.unique_msg_count == 2 # TODO this does not count media messages - # assert stats.most_used_msgs == 0 # TODO should only return the most used or e.g. top10 most used + assert stats.unique_msg_count == 2 + # assert stats.most_used_msgs == 0 # TODO LATER should only return the most used or e.g. top10 most used # assert stats.msg_frequency == 0 assert stats.word_count == 14 assert stats.unique_word_count == 7 @@ -224,6 +224,9 @@ def test_stats_teflon_musk_all_2014_12(statistics): assert stats.char_count == 0 # assert stats.most_used_chars == 0 +class TestConversationAnalyzer: # Foo Bar + pass + def test_time_series_analysis_for_user(analyze): analyzer = analyze('Teflon Musk') diff --git a/tests/TestMessagingAnalyzer.py b/tests/TestMessagingAnalyzer.py index d53af70..b803693 100644 --- a/tests/TestMessagingAnalyzer.py +++ b/tests/TestMessagingAnalyzer.py @@ -2,61 +2,94 @@ from MessagingAnalyzer import MessagingAnalyzer from utils import dt - @pytest.fixture(scope='session') def analyzer(people): return MessagingAnalyzer(people.names, people.individuals) def test_total_number_of_messages(analyzer): - assert analyzer.total_number_of_messages() == 14 + assert analyzer.total_number_of_messages() == 29 + + assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_messages(start=dt(year=2014), period='y') == 11 assert analyzer.total_number_of_messages(start=dt(year=2018), period='y') == 3 + assert analyzer.total_number_of_messages(start=dt(year=2020), period='y') == 15 + assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 assert analyzer.total_number_of_messages(start=dt(year=2014, month=9), period='m') == 1 assert analyzer.total_number_of_messages(start=dt(year=2014, month=11), period='m') == 8 assert analyzer.total_number_of_messages(start=dt(year=2014, month=12), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 assert analyzer.total_number_of_messages(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=2), period='m') == 10 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=3), period='m') == 1 # jpg + assert analyzer.total_number_of_messages(start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=8), period='m') == 1 + + assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13), period='d') == 2 + assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 + + assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 + def test_total_number_of_words(analyzer): - assert analyzer.total_number_of_words() == 24 + assert analyzer.total_number_of_words() == 86 assert analyzer.total_number_of_words(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_words(start=dt(year=2014), period='y') == 20 - assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 4 + assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 32 + assert analyzer.total_number_of_words(start=dt(year=2020), period='y') == 34 assert analyzer.total_number_of_words(start=dt(year=2014, month=9), period='m') == 6 assert analyzer.total_number_of_words(start=dt(year=2014, month=11), period='m') == 13 assert analyzer.total_number_of_words(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 4 + assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 32 assert analyzer.total_number_of_words(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.total_number_of_words(start=dt(year=2020, month=2), period='m') == 27 + assert analyzer.total_number_of_words(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_words(start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.total_number_of_words(start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.total_number_of_words(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_words(start=dt(year=2020, month=8), period='m') == 2 + + assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13), period='d') == 14 + assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 + def test_total_number_of_characters(analyzer): - assert analyzer.total_number_of_characters() == 81 + assert analyzer.total_number_of_characters() == 379 assert analyzer.total_number_of_characters(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_characters(start=dt(year=2014), period='y') == 69 - assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 12 + assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 170 + assert analyzer.total_number_of_characters(start=dt(year=2020), period='y') == 140 assert analyzer.total_number_of_characters(start=dt(year=2014, month=9), period='m') == 24 assert analyzer.total_number_of_characters(start=dt(year=2014, month=11), period='m') == 42 assert analyzer.total_number_of_characters(start=dt(year=2014, month=12), period='m') == 3 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 12 + assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 170 assert analyzer.total_number_of_characters(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=2), period='m') == 114 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=5), period='m') == 4 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_characters(start=dt(year=2020, month=8), period='m') == 5 + def test_total_number_of_messages_sent(analyzer): - assert analyzer.total_number_of_messages_sent() == 8 + assert analyzer.total_number_of_messages_sent() == 17 assert analyzer.total_number_of_messages_sent(start=dt(year=2014), period='y') == 6 assert analyzer.total_number_of_messages_sent(start=dt(year=2018), period='y') == 2 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020), period='y') == 9 assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=9), period='m') == 1 assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=11), period='m') == 4 @@ -67,58 +100,108 @@ def test_total_number_of_messages_sent(analyzer): assert analyzer.total_number_of_messages_sent(start=dt(year=2011, month=11), period='m') == 0 assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2), period='m') == 6 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=8), period='m') == 1 + + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 + assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 + def test_total_number_of_words_sent(analyzer): - assert analyzer.total_number_of_words_sent() == 19 + assert analyzer.total_number_of_words_sent() == 69 assert analyzer.total_number_of_words_sent(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_words_sent(start=dt(year=2014), period='y') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 3 + assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 31 + assert analyzer.total_number_of_words_sent(start=dt(year=2020), period='y') == 22 assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=9), period='m') == 6 assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=11), period='m') == 9 assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 31 assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2), period='m') == 16 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=8), period='m') == 2 + + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13), period='d') == 5 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5 + assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + def test_total_number_of_characters_sent(analyzer): - assert analyzer.total_number_of_characters_sent() == 69 + assert analyzer.total_number_of_characters_sent() == 311 assert analyzer.total_number_of_characters_sent(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_characters_sent(start=dt(year=2014), period='y') == 60 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 9 + assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 167 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020), period='y') == 84 assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=9), period='m') == 24 assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=11), period='m') == 33 assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=12), period='m') == 3 - assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 9 + assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 167 assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2), period='m') == 62 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=5), period='m') == 0 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=8), period='m') == 5 + + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0 + + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21 + assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0 + def test_total_number_of_messages_received(analyzer): - assert analyzer.total_number_of_messages_received() == 6 + assert analyzer.total_number_of_messages_received() == 12 + assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_messages_received(start=dt(year=2014), period='y') == 5 assert analyzer.total_number_of_messages_received(start=dt(year=2018), period='y') == 1 + assert analyzer.total_number_of_messages_received(start=dt(year=2020), period='y') == 6 + + assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0 assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=9), period='m') == 0 assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=11), period='m') == 4 assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1 - assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1 assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2), period='m') == 4 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=3), period='m') == 1 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=4), period='m') == 0 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=8), period='m') == 0 + + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=18), period='d') == 1 + def test_total_number_of_words_received(analyzer): - assert analyzer.total_number_of_words_received() == 5 + assert analyzer.total_number_of_words_received() == 17 assert analyzer.total_number_of_words_received(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_words_received(start=dt(year=2014), period='y') == 4 assert analyzer.total_number_of_words_received(start=dt(year=2018), period='y') == 1 + assert analyzer.total_number_of_words_received(start=dt(year=2020), period='y') == 12 assert analyzer.total_number_of_words_received(start=dt(year=2014, month=9), period='m') == 0 assert analyzer.total_number_of_words_received(start=dt(year=2014, month=11), period='m') == 4 @@ -127,13 +210,22 @@ def test_total_number_of_words_received(analyzer): assert analyzer.total_number_of_words_received(start=dt(year=2018, month=1), period='m') == 1 assert analyzer.total_number_of_words_received(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2), period='m') == 11 + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=5), period='m') == 1 + + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=13), period='d') == 9 + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=14), period='d') == 2 + assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=18), period='d') == 0 + def test_total_number_of_characters_received(analyzer): - assert analyzer.total_number_of_characters_received() == 12 + assert analyzer.total_number_of_characters_received() == 68 assert analyzer.total_number_of_characters_received(start=dt(year=2000), period='y') == 0 assert analyzer.total_number_of_characters_received(start=dt(year=2014), period='y') == 9 assert analyzer.total_number_of_characters_received(start=dt(year=2018), period='y') == 3 + assert analyzer.total_number_of_characters_received(start=dt(year=2020), period='y') == 56 assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=9), period='m') == 0 assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=11), period='m') == 9 @@ -141,3 +233,11 @@ def test_total_number_of_characters_received(analyzer): assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=1), period='m') == 3 assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=2), period='m') == 0 + + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2), period='m') == 52 + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=5), period='m') == 4 + + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=13), period='d') == 30 + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=14), period='d') == 22 + assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=18), period='d') == 0 diff --git a/tests/TestPeople.py b/tests/TestPeople.py index e29448b..61295d8 100644 --- a/tests/TestPeople.py +++ b/tests/TestPeople.py @@ -9,7 +9,7 @@ def people_names(): def test_specific_people_has_or_has_not_got_messages(people): - # TODO parametrize + # TODO LATER parametrize import pandas as pd assert isinstance(people.data.get('Benedek Elek').get('messages'), pd.DataFrame) assert isinstance(people.data.get('Teflon Musk').get('messages'), pd.DataFrame) @@ -33,4 +33,4 @@ def test_some_convos_are_with_friends(people): def test_specific_people_has_or_has_not_got_media(people): assert people.data.get('Teflon Musk').get('media_dir') -#TODO test individuals too \ No newline at end of file +#TODO LATER test individuals too \ No newline at end of file diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/audio/audioclip15905232600004598_2621787141481389.mp4 b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/audio/audioclip15905232600004598_2621787141481389.mp4 new file mode 100644 index 0000000..7c1e48d Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/audio/audioclip15905232600004598_2621787141481389.mp4 differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/files/1810.04805.pdf b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/files/1810.04805.pdf new file mode 100644 index 0000000..2394716 Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/files/1810.04805.pdf differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/19349964_1624604560892442_7457726181358436352_n_487109582171361.gif b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/19349964_1624604560892442_7457726181358436352_n_487109582171361.gif new file mode 100644 index 0000000..54f7837 Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/19349964_1624604560892442_7457726181358436352_n_487109582171361.gif differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/97999627_1419172538270405_8596479473619042304_n_2963870430335255.gif b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/97999627_1419172538270405_8596479473619042304_n_2963870430335255.gif new file mode 100644 index 0000000..079f486 Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/gifs/97999627_1419172538270405_8596479473619042304_n_2963870430335255.gif differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/apple-5391076_960_720.jpg b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/apple-5391076_960_720.jpg new file mode 100644 index 0000000..6c1d5ff Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/apple-5391076_960_720.jpg differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/blueberry-5417154_960_720.jpg b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/blueberry-5417154_960_720.jpg new file mode 100644 index 0000000..15530bd Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/photos/blueberry-5417154_960_720.jpg differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/thumbnails/20562500_1414664788613193_5052712665724834582_n.jpg b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/thumbnails/20562500_1414664788613193_5052712665724834582_n.jpg new file mode 100644 index 0000000..0fca343 Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/thumbnails/20562500_1414664788613193_5052712665724834582_n.jpg differ diff --git a/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/video1501528035_1573509826004448.mp4 b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/video1501528035_1573509826004448.mp4 new file mode 100644 index 0000000..147852b Binary files /dev/null and b/tests/test_data/messages/inbox/FooBar_n5fd6gG50h/videos/video1501528035_1573509826004448.mp4 differ diff --git a/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json b/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json index 61c06a1..ecdb68d 100644 --- a/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json +++ b/tests/test_data/messages/inbox/benedekelek_s4f65sdg/message_1.json @@ -23,7 +23,7 @@ { "sender_name": "Levente Cs\u00c5\u0091ke", "timestamp_ms": 1515571228715, - "content": "yo", + "content": "yo Legyen az, hogy most megprobalok ekezet nelkul irni. Seems pretty easy. I need some english words in here. Right? A magyar szavak felismereset probalom tesztelni ezzekkel a mondatokkal.", "type": "Generic" } ], diff --git a/tests/test_data/messages/inbox/foobar_n5fd6gG50h/message_1.json b/tests/test_data/messages/inbox/foobar_n5fd6gG50h/message_1.json new file mode 100644 index 0000000..26953da --- /dev/null +++ b/tests/test_data/messages/inbox/foobar_n5fd6gG50h/message_1.json @@ -0,0 +1,157 @@ +{ + "participants": [ + { + "name": "Foo Bar" + }, + { + "name": "Levente Cs\u00c5\u0091ke" + } + ], + "messages": [ + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1596910928321, + "content": "OUT! \u00e2\u009d\u00a4", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1588500928123, + "content": "OUT!", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1587850928047, + "videos": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/videos/video1501528035_1573509826004448.mp4", + "creation_timestamp": 1587850940, + "thumbnail": { + "uri": "messages/inbox/FooBar_n5fd6gG50h/videos/thumbnails/20562500_1414664788613193_5052712665724834582_n.jpg" + } + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1585850928047, + "content": "Whet? Check this! :P", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1583750928047, + "audio_files": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/audio/audioclip15905232600004598_2621787141481389.mp4", + "creation_timestamp": 1583750927 + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1582720928145, + "photos": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/photos/apple-5391076_960_720.jpg" + } + ], + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1582010928145, + "gifs": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/gifs/19349964_1624604560892442_7457726181358436352_n_487109582171361.gif" + } + ], + "reactions": [ + { + "reaction": "\u00e2\u009d\u00a4", + "actor": "Levente Cs\u00c5\u0091ke" + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1581980928047, + "content": "What the hack? xdddddd :D", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1581700928145, + "gifs": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/gifs/97999627_1419172538270405_8596479473619042304_n_2963870430335255.gif" + } + ], + "reactions": [ + { + "reaction": "\u00f0\u009f\u0098\u00ae", + "actor": "Foo Bar" + } + ], + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1581690928047, + "content": "Excepteur...laborum. :D", + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1581680928047, + "content": "Duis duia .. ! xdddddd :D", + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1581650928047, + "files": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/files/1810.04805.pdf" + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1581640928145, + "photos": [ + { + "uri": "messages/inbox/FooBar_n5fd6gG50h/photos/blueberry-5417154_960_720.jpg" + } + ], + "type": "Generic" + }, + { + "sender_name": "Foo Bar", + "timestamp_ms": 1581570938715, + "content": "Ut akar ... consequat. oO wow :P xd :D", + "reactions": [ + { + "reaction": "\u00e2\u009d\u00a4", + "actor": "Levente Cs\u00c5\u0091ke" + } + ], + "type": "Generic" + }, + { + "sender_name": "Levente Cs\u00c5\u0091ke", + "timestamp_ms": 1581570928715, + "content": "Lorem lorim.. foo bar \u00f0\u009f\u0098\u00a1\u00f0\u009f\u0098\u00a1\u00f0\u009f\u0098\u00a1", + "type": "Generic" + } + ], + "title": "Foo Bar", + "is_still_participant": true, + "thread_type": "Regular", + "thread_path": "inbox/foobar_n5fd6gG50h" +} \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 76352c0..579569c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,56 +3,85 @@ from pathlib import Path import reusables from reusables.cli import * +from datetime import datetime +import pytest -#TODO convert this to pytest - -class TestUtils(unittest.TestCase): - def setUp(self): - self.test_data_path = Path(f'{pwd()}/test_data') - - def test_read_json(self): - dummy1 = {'0': 'a', '2': 'c'} - dummy1_path = Path(self.test_data_path / 'dummy1.json') - dump_to_json(file=dummy1_path, data=dummy1) - - dummy2 = {'1': 'b', '3': 'd'} - dummy2_path = Path(self.test_data_path / 'dummy2.json') - dump_to_json(file=dummy2_path, data=dummy2) - - dummy_unified = get_messages(dummy1_path, dummy2_path) - expected = {'0': 'a', '1': 'b', '2': 'c', '3': 'd'} - - self.assertDictEqual(expected, dummy_unified) - - dummy1_path.unlink() - dummy2_path.unlink() - - def test_decode_text(self): - dummy = {'0': '\u00c5\u0091', '1': '\u00c3\u00a1', '2': ['\u00c5\u0091', {'0': '\u00c3\u00a1'}], - '3': {'0': '\u00c5\u0091'}} - dummy_path = Path(self.test_data_path / 'dummy.json') - dump_to_json(file=dummy_path, data=dummy) - read = get_messages(dummy_path, decode=True) - expected = {'0': 'ő', '1': 'á', '2': ['ő', {'0': 'á'}], - '3': {'0': 'ő'}} - self.assertEqual(expected, read) - - dummy_path.unlink() - - def later_tests(self): - data = {'participants': [{'name': 'Csőke Boglárka'}, {'name': 'Levente Csőke'}], - 'title': 'Csőke Boglárka', - 'is_still_participant': True, - 'thread_type': 'Regular', - 'thread_path': 'inbox/CsokeBoglarka_5A48Zi9P1w', - 'messages': [...]} - msg_element = {'sender_name': 'Levente Csőke', - 'timestamp_ms': 1440948801592, - 'content': 'ahaa', - 'type': 'Generic'} - - def test_read_real_data(self): - bogi_msg = Path('/home/levente/projects/facebook-data-miner/data/messages/inbox/csokeboglarka_5a48zi9p1w') - data = get_messages(bogi_msg/'message_1.json', bogi_msg/'message_2.json') - msg = data['messages'] \ No newline at end of file + +# class TestUtils(unittest.TestCase): +# def setUp(self): +# self.test_data_path = Path(f'{pwd()}/test_data') +# +# def test_read_json(self): +# dummy1 = {'0': 'a', '2': 'c'} +# dummy1_path = Path(self.test_data_path / 'dummy1.json') +# dump_to_json(file=dummy1_path, data=dummy1) +# +# dummy2 = {'1': 'b', '3': 'd'} +# dummy2_path = Path(self.test_data_path / 'dummy2.json') +# dump_to_json(file=dummy2_path, data=dummy2) +# +# dummy_unified = get_messages(dummy1_path, dummy2_path) +# expected = {'0': 'a', '1': 'b', '2': 'c', '3': 'd'} +# +# self.assertDictEqual(expected, dummy_unified) +# +# dummy1_path.unlink() +# dummy2_path.unlink() +# +# def test_decode_text(self): +# dummy = {'0': '\u00c5\u0091', '1': '\u00c3\u00a1', '2': ['\u00c5\u0091', {'0': '\u00c3\u00a1'}], +# '3': {'0': '\u00c5\u0091'}} +# dummy_path = Path(self.test_data_path / 'dummy.json') +# dump_to_json(file=dummy_path, data=dummy) +# read = get_messages(dummy_path, decode=True) +# expected = {'0': 'ő', '1': 'á', '2': ['ő', {'0': 'á'}], +# '3': {'0': 'ő'}} +# self.assertEqual(expected, read) +# +# dummy_path.unlink() +# +# def later_tests(self): +# data = {'participants': [{'name': 'Csőke Boglárka'}, {'name': 'Levente Csőke'}], +# 'title': 'Csőke Boglárka', +# 'is_still_participant': True, +# 'thread_type': 'Regular', +# 'thread_path': 'inbox/CsokeBoglarka_5A48Zi9P1w', +# 'messages': [...]} +# msg_element = {'sender_name': 'Levente Csőke', +# 'timestamp_ms': 1440948801592, +# 'content': 'ahaa', +# 'type': 'Generic'} +# +# def test_read_real_data(self): +# bogi_msg = Path('/home/levente/projects/facebook-data-miner/data/messages/inbox/csokeboglarka_5a48zi9p1w') +# data = get_messages(bogi_msg/'message_1.json', bogi_msg/'message_2.json') +# msg = data['messages'] +# +# +# decode_text('Minden jot Levii \u00e2\u009d\u00a4\u00ef\u00b8\u008f') +# decode_text('”Hat \u00f0\u009f\u008e\u00a9\u00f0\u009f\u00a4\u0094') +# decode_text('\u00f0\u009f\u0098\u00a1\u00f0\u009f\u0098\u00a1\u00f0\u009f\u0098\u00a1') + + +def test_generate_date_series(): + start = datetime(2020, 1, 1, 0, 0) + end = datetime(2021, 1, 1, 0, 0) + + date_range_year = generate_date_series(start, end, 'y') + assert len(date_range_year) == 1 + 1 + + date_range_month = generate_date_series(start, end, 'm') + assert len(date_range_month) == 12 + 1 + + date_range_day = generate_date_series(start, end, 'd') + assert len(date_range_day) == 366 + 1 + + date_range_hour = generate_date_series(start, end, 'h') + assert len(date_range_hour) == (366 * 24) + 1 + + for day in date_range_day: + assert isinstance(day, datetime) + + with pytest.raises(ValueError): + faulty_date_range = generate_date_series(start, end, ) diff --git a/utils.py b/utils.py index 8594258..2a48624 100644 --- a/utils.py +++ b/utils.py @@ -8,37 +8,6 @@ 'november', 'december'] -def get_stats_for_intervals(func, df, time_series, subject='all'): - data = {} - for offset, series in time_series.items(): - data[offset] = {} - for i in range(len(series) - 1): # only looping len - 1 times - start = series[i] - # TODO will we miss the last entry? I dont think so (99%), but check and correct hand in hand with the timeseries bug - # IT DOES NOT! HOWEVER test it with new data injected/modified at runtime <- this is hard - end = series[i + 1] - data[offset][start] = func(df, subject=subject, start=start, end=end) - return data - - -# @date_checker -def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0): - return datetime(year=year, month=month, day=day, hour=hour) - - -def get_messages(*files, decode=True): - data = {} - for file in files: - temp = decode_text(read_json(file)) if decode else read_json(file) - if not data: - data = temp - elif data.get('messages') and temp.get('messages'): - data['messages'] += temp.get('messages') - if sorted(temp.keys()) != sorted(data.keys()): - data = {**temp, **data} - return data - - def read_json(file): with open(file) as f: return json.load(f) @@ -49,19 +18,6 @@ def dump_to_json(data=None, file=None): json.dump(data, f) -def decode_text(obj): - if isinstance(obj, str): - return obj.encode('latin_1').decode('utf-8') - - if isinstance(obj, list): - return [decode_text(o) for o in obj] - - if isinstance(obj, dict): - return {key: decode_text(item) for key, item in obj.items()} - - return obj - - def order_list_of_dicts(lst, key='timestamp_ms'): return sorted(lst, key=lambda k: k[key]) @@ -128,25 +84,6 @@ def wrapper(*args, **kwargs): return wrapper -# TODO period can refer to 2 things -# 1. I used it for y/m/d/h -# 2. return only with one date_range series -# decide wtf -def generate_time_series(start=None, end=None, period=None): - start = start or datetime(year=2009, month=10, day=2, hour=0) - end = end or datetime.now() - time_series = { - 'y': pd.date_range(start=start, end=end, freq='YS'), # TODO does not include 2009 - 'm': pd.date_range(start=start, end=end, freq='1MS'), # TODO does not include october - 'd': pd.date_range(start=start, end=end, freq='1D'), # TODO does not include 2. ?! not sure if it is true - # TODO put this back after dev phase is over - # 'h': pd.date_range(start=start, end=end, freq='1H'), # TODO hour should only be run ONCE - } - if period and period in ('y', 'm', 'd', 'h'): - return time_series[period] - return time_series - - def subject_checker(func): def wrapper(*args, **kwargs): if not kwargs.get('subject') or kwargs.get('subject') not in ('all', 'me', 'partner'): @@ -173,19 +110,79 @@ def wrapper(*args, **kwargs): return wrapper +DELTA_MAP = { + 'y': relativedelta(years=+1), + 'm': relativedelta(months=+1), + 'd': timedelta(days=1), + 'h': timedelta(hours=1) +} + + def period_checker(func): def wrapper(*args, **kwargs): if kwargs.get('start') is not None and kwargs.get('end') is not None: return func(*args, **kwargs) - delta_map = { - 'y': relativedelta(years=+1), - 'm': relativedelta(months=+1), - 'd': timedelta(days=1), - 'h': timedelta(hours=1) - } - if not kwargs.get('period') or delta_map[kwargs.get('period')] is None: + + if not kwargs.get('period') or DELTA_MAP[kwargs.get('period')] is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') - kwargs['period'] = delta_map[kwargs.get('period')] + kwargs['period'] = DELTA_MAP[kwargs.get('period')] return func(*args, **kwargs) return wrapper + + +def generate_date_series(start=None, end=None, period=None): + if period is None or DELTA_MAP.get(period) is None: + raise ValueError('Parameter `period` should be one of {y, m, d, h}') + start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO change this to date when user joined FB + end = end or datetime.now() + + dates = [] + intermediate = start + while intermediate <= end: + dates.append(intermediate) + intermediate = intermediate + DELTA_MAP.get(period) + return dates + + +def get_stats_for_intervals(func, df, time_series, subject='all'): + data = {} + for offset, series in time_series.items(): + data[offset] = {} + for i in range(len(series) - 1): # only looping len - 1 times + start = series[i] + # TODO LATER will we miss the last entry? I dont think so (99%), but check and correct hand in hand with the timeseries bug + # IT DOES NOT! HOWEVER test it with new data injected/modified at runtime <- this is hard + end = series[i + 1] + data[offset][start] = func(df, subject=subject, start=start, end=end) + return data + + +def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0): + return datetime(year=year, month=month, day=day, hour=hour) + + +def get_messages(*files, decode=True): + data = {} + for file in files: + temp = decode_text(read_json(file)) if decode else read_json(file) + if not data: + data = temp + elif data.get('messages') and temp.get('messages'): + data['messages'] += temp.get('messages') + if sorted(temp.keys()) != sorted(data.keys()): + data = {**temp, **data} + return data + + +def decode_text(obj): + if isinstance(obj, str): + return obj.encode('latin_1').decode('utf-8') + + if isinstance(obj, list): + return [decode_text(o) for o in obj] + + if isinstance(obj, dict): + return {key: decode_text(item) for key, item in obj.items()} + + return obj