Skip to content

Commit

Permalink
Merge pull request #1 from tardigrde/message
Browse files Browse the repository at this point in the history
merging without a conflict
  • Loading branch information
tardigrde authored Aug 11, 2020
2 parents 6fb14da + 0cba2e9 commit 3a6e413
Show file tree
Hide file tree
Showing 34 changed files with 1,945 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,16 @@ data
todo.md


# ignoring trash file
trash.py


# ignoring jupyter notebook
explore.ipynb


# ignoring test playground script
tests/playground.py

tests/.pytest_cache
.pytest_cache
152 changes: 152 additions & 0 deletions ConversationAnalyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import pandas as pd
from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals


class ConversationAnalyzer:
    """Computes statistics over one conversation's message DataFrame.

    Construction returns ``None`` when *messages* is ``None``, so callers
    can skip people that have no message data at all.
    """

    def __new__(cls, name, messages, *args, **kwargs):
        # This deals with the case if no messages: no instance is created.
        if messages is None:
            return None
        # NOTE: extra args/kwargs are intentionally NOT forwarded to
        # object.__new__ -- passing extra arguments to it raises
        # TypeError on Python 3.
        return super(ConversationAnalyzer, cls).__new__(cls)

    def __init__(self, name, messages):
        self.name = name
        # pandas DataFrame of messages, indexed by message date
        self.df = messages

    def __str__(self):
        return f'{self.name}: {list(self.df.index)}'

    @property
    def stats(self):
        """ConversationStats over the full, unfiltered message history."""
        return self.get_stats(self.df)

    # TODO has to be tested
    def get_time_series_data(self, subject='all', **kwargs):
        """Returns per-interval stats (kwargs are passed to generate_time_series)."""
        time_series = generate_time_series(**kwargs)
        return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject)

    def get_plotable_time_series_data(self, interval_stats, statistic):
        """Replaces each ConversationStats value with its *statistic* attribute.

        NOTE: mutates *interval_stats* in place and returns it.
        """
        for key, value in interval_stats.items():
            if isinstance(value, ConversationStats):
                interval_stats[key] = getattr(value, statistic)
        return interval_stats

    def get_stats(self, df, subject='all', start=None, end=None, period=None):
        """Builds a ConversationStats for the given subject/date filters."""
        df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
        return ConversationStats(df)

    @staticmethod
    @subject_checker
    @date_checker
    @period_checker
    def filter_by_input(df, subject='all', start=None, end=None, period=None):
        """Filters messages by sender ('me'/'partner'/'all') and a date window.

        With both start and end, slices [start:end]; with only one of them,
        *period* supplies the other endpoint.
        """
        # TODO the data owner's name should not be hard-coded here.
        if subject == 'me':
            df = df[df.sender_name == 'Levente Csőke']
        elif subject == 'partner':
            df = df[df.sender_name != 'Levente Csőke']
        if start and end:
            df = df.loc[start:end]
        elif start and not end:
            df = df.loc[start:start + period]
        elif not start and end:
            df = df.loc[end - period:end]
        return df


class ConversationStats:
    """
    Statistics of conversation with one person.
    """

    def __init__(self, df):
        # DataFrame of one conversation, one row per message; text lives
        # in the 'content' column (NaN for media-only messages).
        self.df = df

    def __repr__(self):
        return f'{self.msg_count}'

    @property
    def messages(self):
        """Text messages only; rows without text content are dropped."""
        return self.df.content.dropna()

    @property
    def words(self):
        """All words of all messages, lower-cased, as a flat list."""
        return self.get_words()

    # 1.
    @property
    def msg_count(self):
        """Number of messages, including non-text ones."""
        return len(self.df)

    # 2.
    @property
    def unique_msg_count(self):
        return len(self.messages.unique())

    # 3.
    @property
    def most_used_msgs(self):
        # TODO first few (1-10) messages
        return self.messages.value_counts()

    # 4.
    @property
    def msg_frequency(self):
        # TODO this has been most likely deprecated
        pass

    # 5.
    @property
    def word_count(self):
        return len(self.words)

    # 6.
    @property
    def unique_word_count(self):
        return len(set(self.words))

    # 7.
    @property
    def most_used_words(self):
        return pd.Series(self.words).value_counts()

    # 8.
    @property
    def word_frequency(self):
        pass

    # 9.
    @property
    def char_count(self):
        """Total number of characters over all words (whitespace excluded)."""
        return sum(len(word) for word in self.words)

    # 10.
    @property
    def most_used_chars(self):
        return None  # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string

    # 11.
    @property
    def rate_of_media_messages(self):
        pass  # TODO what?

    def get_words(self):
        """Flattens every text message into one lower-cased word list."""
        token_list = self.messages.str.lower().str.split()
        words = []
        for tokens in token_list:
            # Non-string content (e.g. numeric values) yields NaN from
            # str.split instead of a token list; skip those rows.
            if not isinstance(tokens, list):
                print('WARNING! Not a list!')
                continue  # TODO ??? check this
            words.extend(tokens)
        return words
116 changes: 116 additions & 0 deletions Conversations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import os
from FacebookData import FacebookData
import pandas as pd

from datetime import datetime

MESSAGE_SUBPATH = 'messages/inbox'


class Conversations:
    """Discovers message JSON files under a data export and groups them by partner."""

    def __init__(self, data_path):
        # Root of the Facebook export; messages live under messages/inbox.
        self.data_path = f'{data_path}/{MESSAGE_SUBPATH}'

    def get_people(self):
        """Returns a dict mapping partner/group key -> conversation data."""
        json_paths = self.walk_directory_and_search('.json')
        return self.extract_names_from_convos(json_paths)

    def walk_directory_and_search(self, extension):
        """Collects all file paths under data_path whose name ends with *extension*."""
        return [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(self.data_path)
            for name in files
            if name.endswith(extension)
        ]

    # TODO simplify this function!! also this takes very long
    @staticmethod
    def extract_names_from_convos(jsons):
        """Builds {person-or-group key: conversation-data dict} from JSON paths.

        Regular threads are keyed by the partner's name; group threads get
        synthetic keys ('group_0', 'group_1', ...). Messages of the same
        partner found in several files are concatenated and sorted by their
        date index.
        """
        name_data_map = {}
        group_count = 0
        for file in jsons:
            msg = Messages(file)
            for participant in msg.participants:
                key = participant if msg.ttype == 'Regular' else f'group_{group_count}'
                if key == 'Facebook User':  # TODO ?? what to do with this??
                    continue
                # making sure a group is processed only once even though
                # every participant of it is visited
                if name_data_map.get(key) and key.startswith('group'):
                    continue
                if name_data_map.get(key):
                    dfs = [name_data_map[key]['messages'], msg.df]
                    name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index()
                else:
                    name_data_map[key] = {
                        'title': msg.title,
                        'compact_name': msg.compact_names,  # TODO is list ok for if length is only 1??
                        # 'participants': msg.participants + ['Levente Csőke'],
                        'participants': msg.participants,
                        'messages': msg.df,
                        'friend': None,
                        'messages_dir': msg.messages_dir,
                        'media_dir': msg.media_dir
                    }
            # one synthetic key per group thread file
            if msg.ttype == 'RegularGroup':
                group_count += 1

        return name_data_map


class Messages(FacebookData):
    """One conversation thread parsed from a messages JSON file."""

    def __init__(self, json_path):
        super().__init__(json_path)
        self.to_df()
        self.set_date_as_index()

    def to_df(self):
        """Builds the message DataFrame from the decoded JSON."""
        self._df = pd.DataFrame(self.decoded.get('messages'))

    def set_date_as_index(self):
        # TODO maybe not needed; could calculate real time
        # Messages come newest-first in the JSON; reverse for chronological order.
        date_series = self._df.timestamp_ms.apply(self.ts_to_date)
        self._df = self._df.set_index(date_series).iloc[::-1]

    @property
    def names(self):
        return pd.DataFrame(self.participants)[0]

    @property
    def participants(self):
        """Participant names, excluding the data owner."""
        participants = self.decoded.get('participants')
        # TODO I should be IN
        # but this breaks stuff at TestMessagingAnalyzer
        return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke']
        # return [p.get('name') for p in participants if p.get('name')]

    @property
    def title(self):
        return self.decoded.get('title')

    @property
    def ttype(self):
        """Thread type, e.g. 'Regular' or 'RegularGroup'."""
        return self.decoded.get('thread_type')

    @property
    def messages_dir(self):
        """Directory name of this thread below messages/inbox.

        Raises ValueError when the thread path has an unexpected prefix.
        """
        thread_path = self.decoded.get('thread_path')
        if not thread_path.startswith('inbox/'):
            raise ValueError('Something is not okay.')
        # TODO here or in the upper function where we extract names
        return thread_path.split('/')[1].lower()

    @property
    def media_dir(self):
        # todo what should the path contain
        for media in ('photos', 'gifs', 'files', 'videos', 'audio'):
            if media in self._df.columns:
                media_in_msg = list(self._df[media][self._df[media].notnull()])
                # Guard: a media column may exist with no usable entries
                # (previously this raised IndexError); try the next one.
                if not media_in_msg:
                    continue
                # if len(media_in_msg) > 1: # TODO is this ok. i think it is. think multiple photos sent once
                #     print('Media in msg is bigger than 1')
                uri = media_in_msg[0][0].get('uri')
                return os.path.dirname(os.path.dirname(uri))
        return None

    @staticmethod
    def ts_to_date(date):
        """Converts a millisecond epoch timestamp to a (naive, local) datetime."""
        return datetime.fromtimestamp(date / 1000)  # .strftime('%Y-%m-%d')
40 changes: 40 additions & 0 deletions FacebookData.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from utils import read_json, decode_text, accents_map


class FacebookData:
    """Base class for Facebook export files: lazy JSON access plus name helpers."""

    def __init__(self, json_path):
        self.json_path = json_path
        self._df = None  # populated by subclasses via their to_df()

    @property
    def df(self):
        return self._df

    @property
    def decoded(self):
        """JSON content with Facebook's broken text encoding fixed."""
        return decode_text(self.json)

    @property
    def json(self):
        # NOTE: re-reads the file on every access (no caching).
        return read_json(self.json_path)

    @property
    def compact_names(self):
        # NOTE this is the place where we change pd/np to builtin
        # do we have to do this?
        name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names)))
        # A single name is returned bare, not wrapped in a list.
        return name_list[0] if len(name_list) == 1 else name_list

    @staticmethod
    def lower_names(col):
        """Lower-cases every entry of a string Series."""
        return col.str.lower()

    @staticmethod
    def without_accent_and_whitespace(col):
        """Replaces accented characters (per accents_map) and strips spaces."""
        def replace_accents(text):
            for accented, plain in accents_map.items():
                if accented in text:
                    text = text.replace(accented, plain)
            return text.replace(' ', '')

        return col.apply(replace_accents)
36 changes: 36 additions & 0 deletions Friends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import os
from FacebookData import FacebookData
from utils import accents_map


class Friends(FacebookData):
    """The data owner's friend list read from friends.json."""

    def __init__(self, *args):
        super().__init__(*args)
        self.to_df()

    def get_people(self):
        """Returns {friend name: person-data dict} for every friend."""
        compacts = self.compact_names
        # compact_names returns a bare string when there is exactly one
        # friend; normalize it so zip() pairs names, not characters.
        if isinstance(compacts, str):
            compacts = [compacts]
        return {
            name: {
                'title': name,
                'compact_name': compact,
                'messages': None,
                'friend': True,
                'participants': None,
                'messages_dir': None,
                'media_dir': None
            }
            for name, compact in zip(self.names, compacts)
        }

    def to_df(self):
        self._df = pd.DataFrame(self.decoded.get('friends'))

    @property
    def names(self):
        return self.df.name
Loading

0 comments on commit 3a6e413

Please sign in to comment.