refactored analyzer; added some documentation; resolved TODOs

tardigrde · Aug 16, 2020 · 55ba0d0 · 55ba0d0
1 parent b2f725e
commit 55ba0d0
Show file tree

Hide file tree

Showing 17 changed files with 166 additions and 113 deletions.
diff --git a/miner/Analyzer.py b/miner/Analyzer.py
@@ -1,34 +1,45 @@
+import pandas as pd
+
 from miner.ConversationStats import ConversationStats
 from miner import utils
-import pandas as pd
 
 
 class Analyzer:
-    # TODO do we need to override __subclasscheck__ ?
+    """
+    Analyzes either a single conversation or all conversations together
 
-    # def __new__(cls, name, messages, *args, **kwargs):
-    #     if messages is None:  # This deals with the case if no messages
-    #         return None
-    #     return super(Analyzer, cls).__new__(cls, *args, **kwargs)
+    """
 
     def __init__(self, people):
         self.people = people
         self.people_data = people.data
-        self.names = people.names
+        self.names = list(people.names)
         self.multi = len(self.people_data) > 1
 
         if self.multi:
-            self.df = self.stack_dfs()
+            self.df = self.stack_dfs(self.people_data)
+        else:
+            self.df = self.people_data.get(self.names[0]).messages
+
+    def __str__(self):
+        if self.multi:
+            return self.names
         else:
-            # TODO solve this hand in hand with the __new__ method. too ugly
-            self.df = self.people_data.get(list(self.names)[0]).messages
+            return f'{self.names[0]}: {list(self.df.index)}'
 
-    def get_stats_for_intervals(self, time_series, subject='all'):
+    @property
+    def stats(self):
+        return self.get_stats()
+
+    def get_stats_for_intervals(self, time_series, period, subject='all'):
         data = {}
-        for i in range(len(time_series) - 1):  # only looping len - 1 times
+        for i in range(len(time_series)):
             start = time_series[i]
-            end = time_series[i + 1]
-            data[start] = self.get_stats(self.df, subject=subject, start=start, end=end)
+            try:  # with this solution we will have data for the very last moments until datetime.now()
+                end = time_series[i + 1]
+            except IndexError:
+                end = None
+            data[start] = self.get_stats(df=self.df, subject=subject, start=start, end=end, period=period)
         return data
 
     def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
@@ -37,39 +48,13 @@ def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
         stats = ConversationStats(df)
         return stats
 
-    @staticmethod
-    def get_plottable_time_series_data(interval_stats, statistic):
-        for k, v in interval_stats.items():
-            if isinstance(v, ConversationStats):
-                interval_stats[k] = getattr(v, statistic)
-        return interval_stats
-
-    @property
-    def stats(self):
-        return self.get_stats()
-
-    def __str__(self):
-        if self.multi:
-            return self.names
-        else:
-            return f'{self.names[0]}: {list(self.df.index)}'
-
-    def stack_dfs(self):
-        dfs = []
-        for data in self.people_data.values():
-            if data.messages is not None:
-                dfs.append(data.messages)
-        return pd.concat(dfs).sort_index()
-
     # 1. Total count of messages/words/characters (also by year/month/day/hour)
     # 2. Total count of messages/words/characters sent (also by year/month/day/hour)
     # 3. Total count of messages/words/characters received (also by year/month)
     def get_count(self, attribute, subject='all', start=None, end=None, period=None):
         stats = self.get_stats(subject=subject, start=start, end=end, period=period)
         return getattr(stats, attribute)
 
-    #################
-
     # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour)
     def most_used_messages_(self, **kwargs):
         """
@@ -88,38 +73,47 @@ def most_used_messages_(self, **kwargs):
         pass
 
     # 5. Number of messages sent/got on busiest period (by year/month/day/hour)
-    def stat_per_period(self, period, attribute, **kwargs):
+    def stat_per_period(self, period, statistic, **kwargs):
         interval_stats = self.get_time_series_data(period, **kwargs)
-        # TODO attribute is one of (msg, word, char)
-        time_series_data = self.get_plottable_time_series_data(interval_stats, statistic=attribute)
+        time_series_data = self.get_stat_count(interval_stats, statistic=statistic)
         return utils.count_stat_for_period(time_series_data, period)
 
-    # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos'
+    # 6. Time series: dict of 'y/m/d/h : number of messages/words/characters (also sent/got) for user/all convos'
     def get_time_series_data(self, period, subject='all', **kwargs):
-        time_series = utils.generate_date_series(period, **kwargs)
-        return self.get_stats_for_intervals(self.df, time_series, subject=subject)
+        time_series = utils.generate_date_series(period=period, **kwargs)
+        return self.get_stats_for_intervals(time_series, period, subject=subject)
 
-    # # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got
-    def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None,
-                                           period=None):
-        # TODO almost the same function as get_count
+    # 7. Ranking of partners by messages by y/m/d/h, by different stats, by sent/got
+    def get_ranking_of_partners_by_messages(self, statistic='msg_count', **kwargs):
         count_dict = {}
         for name in self.names:
-            # analyzer = Analyzer({name: self.people.get(name)}) # this has to be a people instance?! OR?
-            # analyzer = Analyzer(People(self.people.data_path, name=name))  # this has to be a people instance?! OR?
             df = self.df[self.df.partner == name]
-            stats = self.get_stats(df=df, subject=subject, start=start, end=end, period=period)
+            stats = self.get_stats(df=df, **kwargs)
             if stats is not None:
-                count_dict = utils.fill_dict(count_dict, name, getattr(stats, attribute))
-
-        count_dict = {key: value for key, value in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)}
+                count_dict = utils.fill_dict(count_dict, name, getattr(stats, statistic))
         return count_dict
 
+    @staticmethod
+    def stack_dfs(people_data):
+        dfs = []
+        for data in people_data.values():
+            if data.messages is not None:
+                dfs.append(data.messages)
+        return pd.concat(dfs).sort_index()
+
+    @staticmethod
+    @utils.attribute_checker
+    def get_stat_count(interval_stats, statistic='msg_count'):
+        for k, v in interval_stats.items():
+            interval_stats[k] = getattr(v, statistic)
+        return interval_stats
+
     @staticmethod
     @utils.subject_checker
     @utils.date_checker
-    @utils.period_checker
+    @utils.start_end_period_checker
     def filter_by_input(df, subject='all', start=None, end=None, period=None):
+
         if subject == 'me':
             df = df[df.sender_name == 'Levente Csőke']
         elif subject == 'partner':

diff --git a/miner/App.py b/miner/App.py
@@ -1,11 +1,16 @@
-from miner.Analyzer import Analyzer
+import os
 
+from miner.Analyzer import Analyzer
 from miner.People import People
 
-DATA_PATH = '/home/levente/projects/facebook-data-miner/data'
+DATA_PATH = f'{os.getcwd()}/data'
 
 
 class App:
+    """
+    Entrypoint. Not yet used extensively.
+    # TODO LATER turn it into a cli
+    """
     def __init__(self):
         pass
 
@@ -14,7 +19,7 @@ def analyze_messages():
         p = People(path=DATA_PATH)
 
         analyzer = Analyzer(p)
-        rank = analyzer.get_ranking_of_friends_by_messages(attribute='char_count')
+        rank = analyzer.get_ranking_of_partners_by_messages(attribute='char_count')
 
 
 if __name__ == '__main__':

diff --git a/miner/Conversations.py b/miner/Conversations.py
@@ -1,17 +1,20 @@
 import pandas as pd
 import os
 
-
 from miner.Messages import Messages
 from miner.Individual import Individual
 
 from miner import utils
 
 
 class Conversations:
+    """
+    Class for managing and parsing conversations
+    """
+
     def __init__(self, data_path):
         self.private_convo_paths = {}
-        self.group_convo_paths = {} # TODO fill this as well
+        self.group_convo_paths = {}  # TODO LATER fill this as well
         self.deleted_user_convo_paths = []  # NOTE these are collected but not yet used
 
         self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}'
@@ -41,7 +44,7 @@ def differentiate_paths(self, jsons):
 
     def register_paths(self):
         utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json')
-        
+
     def read_paths(self, file):
         self.private_convo_paths = utils.read_json(file)
         print()
@@ -55,6 +58,8 @@ def map_private_convo_files(self, msg, file):
 
     def map_group_convo_files(self, msg, file):
         for participant in msg.participants:
+            if participant == 'Levente Csőke':
+                continue
             if self.group_convo_paths.get(file):
                 self.group_convo_paths[file].append(participant)
             else:
@@ -100,7 +105,4 @@ def group_membership(name):
         return None
 
     def get_people_from_group_messages(self):
-        pass  # TODO for v0.0.4
-
-
-
+        pass
diff --git a/miner/FacebookData.py b/miner/FacebookData.py
@@ -3,6 +3,10 @@
 
 
 class FacebookData:
+    """
+    Base class for reading in tabular data from JSONs.
+    """
+
     def __init__(self, json_path):
         self.json_path = json_path
         self._df = None
@@ -21,7 +25,7 @@ def json(self):
 
     @property
     def compact_names(self):
-        name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names)))  # should be just fine
+        name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names)))
         return name_list[0] if len(name_list) == 1 else name_list
 
     def to_df(self, field=None):

diff --git a/miner/Friends.py b/miner/Friends.py
@@ -3,6 +3,9 @@
 
 
 class Friends(FacebookData):
+    """
+    Class for storing the data read from friends.json
+    """
 
     def __init__(self, *args):
         super().__init__(*args)

diff --git a/miner/Group.py b/miner/Group.py
@@ -4,6 +4,10 @@
 
 
 class Group:
+    """
+    Class for holding a group-message's data
+    """
+
     def __init__(self, name=None, title=None, messages=None, compact=None, messages_dir=None, media_dir=None,
                  members=None):
         self._name = name
@@ -26,10 +30,6 @@ def title(self):
     def messages(self):
         return self._messages
 
-    # @property
-    # def get_message_jsons(self):
-    #     return self._messages
-
     @property
     def media_dir(self):
         return self._media_dir

diff --git a/miner/Individual.py b/miner/Individual.py
@@ -1,4 +1,8 @@
 class Individual:
+    """
+    Class for holding the data of a person the user has ever interacted with
+    """
+
     def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None,
                  media_dir=None,
                  member_of=None):

diff --git a/miner/Me.py b/miner/Me.py
@@ -2,6 +2,10 @@
 
 
 class Me(FacebookData):
+    """
+    Class for storing basic data about the user
+    """
+
     def __init__(self, *args):
         super().__init__(*args)
 

diff --git a/miner/Messages.py b/miner/Messages.py
@@ -7,6 +7,10 @@
 
 
 class Messages(FacebookData):
+    """
+    Class for representing data of all the messages with a user or a group
+    """
+
     def __init__(self, json_path):
         super().__init__(json_path)
         self.to_df('messages')
@@ -15,7 +19,6 @@ def __init__(self, json_path):
 
     @property
     def names(self):
-        # TODO ugly
         try:
             return pd.DataFrame(self.participants)[0]
         except KeyError:
@@ -24,10 +27,7 @@ def names(self):
     @property
     def participants(self):
         participants = self.decoded.get('participants')
-        # TODO I should be IN
-        # but this breaks stuff at TestMessagingAnalyzer
-        return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke']
-        # return [p.get('name') for p in participants if p.get('name')]
+        return [p.get('name') for p in participants if p.get('name')]
 
     @property
     def title(self):

diff --git a/miner/People.py b/miner/People.py
@@ -1,12 +1,18 @@
 import time
+import os
 
 from miner.Conversations import Conversations
 from miner.Friends import Friends
 
-DATA_PATH = '/home/levente/projects/facebook-data-miner/data'
+DATA_PATH = f'{os.getcwd()}/data'
 
 
 class People:
+    """
+    Class that manages and represents people from different kinds of interactions
+    # TODO LATER abstraction flaw?! people? person? indie?
+    """
+
     def __init__(self, path=None, name=None):
         self.data_path = path if path else DATA_PATH
         self._groups = []
@@ -19,7 +25,7 @@ def data(self):
 
     @property
     def names(self):
-        return self._names #if len(self._names) > 1 else self._names[0]
+        return self._names  # if len(self._names) > 1 else self._names[0]
 
     @property
     def groups(self):
@@ -31,7 +37,6 @@ def get_people(self, name=None):
         friends = friend.get_people(name=name)
         print('friends: ', time.time() - start)
 
-        # TODO LATER too slow -> store in file
         start = time.time()
         conversations = Conversations(self.data_path)
         print('convos1: ', time.time() - start)

diff --git a/miner/requirements.txt b/miner/requirements.txt
@@ -1,9 +1,5 @@
 numpy==1.18.1
 pandas==1.0.3
 dateparser==0.7.6
-seaborn==0.10.1
 matplotlib==3.2.1
-plotly==4.8.2
-miner==0.0.0
-Pillow==7.2.0
 python_dateutil==2.8.1