Skip to content

Commit

Permalink
Merge pull request #1 from tardigrde/message
Browse files Browse the repository at this point in the history
merging without a conflict
  • Loading branch information
tardigrde authored Aug 11, 2020
2 parents 6fb14da + 0cba2e9 commit 3a6e413
Show file tree
Hide file tree
Showing 34 changed files with 1,945 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,16 @@ data
todo.md


# ignoring trash file
trash.py


# ignoring jupyter notebook
explore.ipynb


# ignoring test playground script
tests/playground.py

tests/.pytest_cache
.pytest_cache
152 changes: 152 additions & 0 deletions ConversationAnalyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import pandas as pd
from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals


class ConversationAnalyzer:
    """Computes statistics over one conversation's message DataFrame.

    Construction returns ``None`` when *messages* is ``None``, so callers
    can skip people that have no message data at all.
    """

    def __new__(cls, name, messages, *args, **kwargs):
        # This deals with the case if no messages: no instance is created.
        if messages is None:
            return None
        # NOTE: extra args/kwargs are intentionally NOT forwarded to
        # object.__new__ -- passing extra arguments to it raises
        # TypeError on Python 3.
        return super(ConversationAnalyzer, cls).__new__(cls)

    def __init__(self, name, messages):
        self.name = name
        # pandas DataFrame of messages, indexed by message date
        self.df = messages

    def __str__(self):
        return f'{self.name}: {list(self.df.index)}'

    @property
    def stats(self):
        """ConversationStats over the full, unfiltered message history."""
        return self.get_stats(self.df)

    # TODO has to be tested
    def get_time_series_data(self, subject='all', **kwargs):
        """Returns per-interval stats (kwargs are passed to generate_time_series)."""
        time_series = generate_time_series(**kwargs)
        return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject)

    def get_plotable_time_series_data(self, interval_stats, statistic):
        """Replaces each ConversationStats value with its *statistic* attribute.

        NOTE: mutates *interval_stats* in place and returns it.
        """
        for key, value in interval_stats.items():
            if isinstance(value, ConversationStats):
                interval_stats[key] = getattr(value, statistic)
        return interval_stats

    def get_stats(self, df, subject='all', start=None, end=None, period=None):
        """Builds a ConversationStats for the given subject/date filters."""
        df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
        return ConversationStats(df)

    @staticmethod
    @subject_checker
    @date_checker
    @period_checker
    def filter_by_input(df, subject='all', start=None, end=None, period=None):
        """Filters messages by sender ('me'/'partner'/'all') and a date window.

        With both start and end, slices [start:end]; with only one of them,
        *period* supplies the other endpoint.
        """
        # TODO the data owner's name should not be hard-coded here.
        if subject == 'me':
            df = df[df.sender_name == 'Levente Csőke']
        elif subject == 'partner':
            df = df[df.sender_name != 'Levente Csőke']
        if start and end:
            df = df.loc[start:end]
        elif start and not end:
            df = df.loc[start:start + period]
        elif not start and end:
            df = df.loc[end - period:end]
        return df


class ConversationStats:
    """
    Statistics of conversation with one person.
    """

    def __init__(self, df):
        # DataFrame of one conversation, one row per message; text lives
        # in the 'content' column (NaN for media-only messages).
        self.df = df

    def __repr__(self):
        return f'{self.msg_count}'

    @property
    def messages(self):
        """Text messages only; rows without text content are dropped."""
        return self.df.content.dropna()

    @property
    def words(self):
        """All words of all messages, lower-cased, as a flat list."""
        return self.get_words()

    # 1.
    @property
    def msg_count(self):
        """Number of messages, including non-text ones."""
        return len(self.df)

    # 2.
    @property
    def unique_msg_count(self):
        return len(self.messages.unique())

    # 3.
    @property
    def most_used_msgs(self):
        # TODO first few (1-10) messages
        return self.messages.value_counts()

    # 4.
    @property
    def msg_frequency(self):
        # TODO this has been most likely deprecated
        pass

    # 5.
    @property
    def word_count(self):
        return len(self.words)

    # 6.
    @property
    def unique_word_count(self):
        return len(set(self.words))

    # 7.
    @property
    def most_used_words(self):
        return pd.Series(self.words).value_counts()

    # 8.
    @property
    def word_frequency(self):
        pass

    # 9.
    @property
    def char_count(self):
        """Total number of characters over all words (whitespace excluded)."""
        return sum(len(word) for word in self.words)

    # 10.
    @property
    def most_used_chars(self):
        return None  # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string

    # 11.
    @property
    def rate_of_media_messages(self):
        pass  # TODO what?

    def get_words(self):
        """Flattens every text message into one lower-cased word list."""
        token_list = self.messages.str.lower().str.split()
        words = []
        for tokens in token_list:
            # Non-string content (e.g. numeric values) yields NaN from
            # str.split instead of a token list; skip those rows.
            if not isinstance(tokens, list):
                print('WARNING! Not a list!')
                continue  # TODO ??? check this
            words.extend(tokens)
        return words
116 changes: 116 additions & 0 deletions Conversations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import os
from FacebookData import FacebookData
import pandas as pd

from datetime import datetime

MESSAGE_SUBPATH = 'messages/inbox'


class Conversations:
    """Discovers message JSON files under a data export and groups them by partner."""

    def __init__(self, data_path):
        # Root of the Facebook export; messages live under messages/inbox.
        self.data_path = f'{data_path}/{MESSAGE_SUBPATH}'

    def get_people(self):
        """Returns a dict mapping partner/group key -> conversation data."""
        json_paths = self.walk_directory_and_search('.json')
        return self.extract_names_from_convos(json_paths)

    def walk_directory_and_search(self, extension):
        """Collects all file paths under data_path whose name ends with *extension*."""
        return [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(self.data_path)
            for name in files
            if name.endswith(extension)
        ]

    # TODO simplify this function!! also this takes very long
    @staticmethod
    def extract_names_from_convos(jsons):
        """Builds {person-or-group key: conversation-data dict} from JSON paths.

        Regular threads are keyed by the partner's name; group threads get
        synthetic keys ('group_0', 'group_1', ...). Messages of the same
        partner found in several files are concatenated and sorted by their
        date index.
        """
        name_data_map = {}
        group_count = 0
        for file in jsons:
            msg = Messages(file)
            for participant in msg.participants:
                key = participant if msg.ttype == 'Regular' else f'group_{group_count}'
                if key == 'Facebook User':  # TODO ?? what to do with this??
                    continue
                # making sure a group is processed only once even though
                # every participant of it is visited
                if name_data_map.get(key) and key.startswith('group'):
                    continue
                if name_data_map.get(key):
                    dfs = [name_data_map[key]['messages'], msg.df]
                    name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index()
                else:
                    name_data_map[key] = {
                        'title': msg.title,
                        'compact_name': msg.compact_names,  # TODO is list ok for if length is only 1??
                        # 'participants': msg.participants + ['Levente Csőke'],
                        'participants': msg.participants,
                        'messages': msg.df,
                        'friend': None,
                        'messages_dir': msg.messages_dir,
                        'media_dir': msg.media_dir
                    }
            # one synthetic key per group thread file
            if msg.ttype == 'RegularGroup':
                group_count += 1

        return name_data_map


class Messages(FacebookData):
    """One conversation thread parsed from a messages JSON file."""

    def __init__(self, json_path):
        super().__init__(json_path)
        self.to_df()
        self.set_date_as_index()

    def to_df(self):
        """Builds the message DataFrame from the decoded JSON."""
        self._df = pd.DataFrame(self.decoded.get('messages'))

    def set_date_as_index(self):
        # TODO maybe not needed; could calculate real time
        # Messages come newest-first in the JSON; reverse for chronological order.
        date_series = self._df.timestamp_ms.apply(self.ts_to_date)
        self._df = self._df.set_index(date_series).iloc[::-1]

    @property
    def names(self):
        return pd.DataFrame(self.participants)[0]

    @property
    def participants(self):
        """Participant names, excluding the data owner."""
        participants = self.decoded.get('participants')
        # TODO I should be IN
        # but this breaks stuff at TestMessagingAnalyzer
        return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke']
        # return [p.get('name') for p in participants if p.get('name')]

    @property
    def title(self):
        return self.decoded.get('title')

    @property
    def ttype(self):
        """Thread type, e.g. 'Regular' or 'RegularGroup'."""
        return self.decoded.get('thread_type')

    @property
    def messages_dir(self):
        """Directory name of this thread below messages/inbox.

        Raises ValueError when the thread path has an unexpected prefix.
        """
        thread_path = self.decoded.get('thread_path')
        if not thread_path.startswith('inbox/'):
            raise ValueError('Something is not okay.')
        # TODO here or in the upper function where we extract names
        return thread_path.split('/')[1].lower()

    @property
    def media_dir(self):
        # todo what should the path contain
        for media in ('photos', 'gifs', 'files', 'videos', 'audio'):
            if media in self._df.columns:
                media_in_msg = list(self._df[media][self._df[media].notnull()])
                # Guard: a media column may exist with no usable entries
                # (previously this raised IndexError); try the next one.
                if not media_in_msg:
                    continue
                # if len(media_in_msg) > 1: # TODO is this ok. i think it is. think multiple photos sent once
                #     print('Media in msg is bigger than 1')
                uri = media_in_msg[0][0].get('uri')
                return os.path.dirname(os.path.dirname(uri))
        return None

    @staticmethod
    def ts_to_date(date):
        """Converts a millisecond epoch timestamp to a (naive, local) datetime."""
        return datetime.fromtimestamp(date / 1000)  # .strftime('%Y-%m-%d')
40 changes: 40 additions & 0 deletions FacebookData.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from utils import read_json, decode_text, accents_map


class FacebookData:
    """Base class for Facebook export files: lazy JSON access plus name helpers."""

    def __init__(self, json_path):
        self.json_path = json_path
        self._df = None  # populated by subclasses via their to_df()

    @property
    def df(self):
        return self._df

    @property
    def decoded(self):
        """JSON content with Facebook's broken text encoding fixed."""
        return decode_text(self.json)

    @property
    def json(self):
        # NOTE: re-reads the file on every access (no caching).
        return read_json(self.json_path)

    @property
    def compact_names(self):
        # NOTE this is the place where we change pd/np to builtin
        # do we have to do this?
        name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names)))
        # A single name is returned bare, not wrapped in a list.
        return name_list[0] if len(name_list) == 1 else name_list

    @staticmethod
    def lower_names(col):
        """Lower-cases every entry of a string Series."""
        return col.str.lower()

    @staticmethod
    def without_accent_and_whitespace(col):
        """Replaces accented characters (per accents_map) and strips spaces."""
        def replace_accents(text):
            for accented, plain in accents_map.items():
                if accented in text:
                    text = text.replace(accented, plain)
            return text.replace(' ', '')

        return col.apply(replace_accents)
36 changes: 36 additions & 0 deletions Friends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import os
from FacebookData import FacebookData
from utils import accents_map


class Friends(FacebookData):
    """The data owner's friend list read from friends.json."""

    def __init__(self, *args):
        super().__init__(*args)
        self.to_df()

    def get_people(self):
        """Returns {friend name: person-data dict} for every friend."""
        compacts = self.compact_names
        # compact_names returns a bare string when there is exactly one
        # friend; normalize it so zip() pairs names, not characters.
        if isinstance(compacts, str):
            compacts = [compacts]
        return {
            name: {
                'title': name,
                'compact_name': compact,
                'messages': None,
                'friend': True,
                'participants': None,
                'messages_dir': None,
                'media_dir': None
            }
            for name, compact in zip(self.names, compacts)
        }

    def to_df(self):
        self._df = pd.DataFrame(self.decoded.get('friends'))

    @property
    def names(self):
        return self.df.name
Loading

0 comments on commit 3a6e413

Please sign in to comment.