-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from tardigrde/message
merging without a conflict
- Loading branch information
Showing
34 changed files
with
1,945 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
import pandas as pd | ||
from utils import date_checker, period_checker, subject_checker, generate_time_series, get_stats_for_intervals | ||
|
||
|
||
class ConversationAnalyzer:
    """Compute statistics over the messages of one conversation.

    ``name`` labels the conversation and ``messages`` is stored as ``self.df``
    (presumably a pandas DataFrame indexed by date -- confirm with callers).
    Constructing the class with ``messages=None`` yields ``None`` instead of
    an instance.
    """

    def __new__(cls, name, messages, *args, **kwargs):
        # Without messages there is nothing to analyze, so no instance is built.
        if messages is not None:
            return super().__new__(cls, *args, **kwargs)
        return None

    def __init__(self, name, messages):
        self.name, self.df = name, messages

    def __str__(self):
        index_values = list(self.df.index)
        return f'{self.name}: {index_values}'

    @property
    def stats(self):
        """Statistics over the whole, unfiltered conversation."""
        return self.get_stats(self.df)

    # TODO has to be tested
    def get_time_series_data(self, subject='all', **kwargs):
        """Return per-interval statistics.

        ``kwargs`` are forwarded to ``generate_time_series``; the resulting
        series is handed to ``get_stats_for_intervals`` together with
        ``self.get_stats`` and ``self.df``.
        """
        return get_stats_for_intervals(
            self.get_stats,
            self.df,
            generate_time_series(**kwargs),
            subject=subject,
        )

    def get_plotable_time_series_data(self, interval_stats, statistic):
        """Replace each ``ConversationStats`` value by its ``statistic`` attribute.

        The mapping is modified in place and also returned; values of any
        other type are left untouched.
        """
        for interval in interval_stats:
            entry = interval_stats[interval]
            if not isinstance(entry, ConversationStats):
                continue
            interval_stats[interval] = getattr(entry, statistic)
        return interval_stats

    def get_stats(self, df, subject='all', start=None, end=None, period=None):
        """Filter ``df`` by the given criteria and wrap it in ``ConversationStats``."""
        filtered = self.filter_by_input(
            df, subject=subject, start=start, end=end, period=period)
        return ConversationStats(filtered)

    @staticmethod
    @subject_checker
    @date_checker
    @period_checker
    def filter_by_input(df, subject='all', start=None, end=None, period=None):
        """Restrict ``df`` to one sender and/or a date window.

        ``subject`` of ``'me'`` keeps rows sent by 'Levente Csőke',
        ``'partner'`` keeps all other rows, anything else keeps every row.
        The window is ``start..end`` when both are given, otherwise
        ``period`` is added to ``start`` or subtracted from ``end``.
        NOTE(review): the decorators presumably validate/normalize the
        arguments (e.g. supply ``period``) -- confirm in ``utils``.
        """
        if subject == 'me':
            df = df[df.sender_name == 'Levente Csőke']
        elif subject == 'partner':
            df = df[df.sender_name != 'Levente Csőke']

        if start and end:
            lower, upper = start, end
        elif start:
            lower, upper = start, start + period
        elif end:
            lower, upper = end - period, end
        else:
            # No date bound at all: only the sender filter applies.
            return df
        return df.loc[lower:upper]
|
||
|
||
class ConversationStats:
    """
    Statistics of conversation with one person.

    Wraps a DataFrame of messages (``self.df``) and exposes counts derived
    from its ``content`` column. Several statistics are still stubs and
    return ``None``.
    """

    # TODO do we need this or not?!?! smh
    # def __new__(cls, df, *args, **kwargs):
    #     if not len(df.index):  # This deals with the case if input df is empty
    #         return None
    #     return super(ConversationStats, cls).__new__(cls, *args, **kwargs)

    def __init__(self, df):
        self.df = df

    def __repr__(self):
        return f'{self.msg_count}'

    @property
    def messages(self):
        """Text of the messages as a Series, with missing content dropped.

        Returns an empty Series when the frame has no ``content`` column at
        all, instead of raising AttributeError.
        """
        if 'content' not in self.df.columns:
            return pd.Series([], dtype=object)
        return self.df.content.dropna()

    @property
    def words(self):
        """Lower-cased, whitespace-separated tokens of all messages."""
        return self.get_words()

    # 1.
    @property
    def msg_count(self):
        """Number of rows in the frame (including rows without text)."""
        return len(self.df)

    # 2.
    @property
    def unique_msg_count(self):
        """Number of distinct message texts."""
        return len(self.messages.unique())

    # 3.
    @property
    def most_used_msgs(self):
        """Occurrence count per message text, most frequent first."""
        # TODO first few (1-10) messages
        return self.messages.value_counts()

    # 4.
    @property
    def msg_frequency(self):
        # TODO this has been most likely deprecated
        return None

    # 5.
    @property
    def word_count(self):
        """Total number of tokens."""
        return len(self.words)

    # 6.
    @property
    def unique_word_count(self):
        """Number of distinct tokens."""
        return len(set(self.words))

    # 7.
    @property
    def most_used_words(self):
        """Occurrence count per token, most frequent first."""
        return pd.Series(self.words).value_counts()

    # 8.
    @property
    def word_frequency(self):
        # Not implemented yet.
        return None

    # 9.
    @property
    def char_count(self):
        """Total number of characters over all tokens (whitespace excluded)."""
        return sum(len(word) for word in self.words)

    # 10.
    @property
    def most_used_chars(self):
        return None  # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string

    # 11.
    @property
    def rate_of_media_messages(self):
        return None  # TODO what?

    def get_words(self):
        """Return every token of every message as one flat list.

        Each message is lower-cased and split on whitespace. Entries that
        are not strings cannot be tokenized; they are reported and skipped.
        """
        words = []
        for message in self.messages:
            if not isinstance(message, str):
                print('WARNING! Not a list!')
                continue  # TODO ??? check this
            words.extend(message.lower().split())
        return words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import os | ||
from FacebookData import FacebookData | ||
import pandas as pd | ||
|
||
from datetime import datetime | ||
|
||
MESSAGE_SUBPATH = 'messages/inbox' | ||
|
||
|
||
class Conversations:
    """Collect the message threads found under ``<data_path>/messages/inbox``."""

    def __init__(self, data_path):
        self.data_path = f'{data_path}/{MESSAGE_SUBPATH}'

    def get_people(self):
        """Return a mapping of conversation key -> conversation data."""
        json_paths = self.walk_directory_and_search('.json')
        return self.extract_names_from_convos(json_paths)

    def walk_directory_and_search(self, extension):
        """Return the paths of all files below ``self.data_path`` ending in ``extension``."""
        return [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(self.data_path)
            for name in files
            if name.endswith(extension)
        ]

    # TODO simplify this function!!
    @staticmethod
    def extract_names_from_convos(jsons):
        """Build one entry per conversation from the given JSON files.

        One-to-one threads (``thread_type == 'Regular'``) are keyed by the
        participant's name; messages from several files of the same
        participant are concatenated and sorted by index. Every other
        thread gets a ``group_<n>`` key and is stored only once.

        :param jsons: iterable of paths to message JSON files.
        :return: dict mapping key -> dict with the keys ``title``,
            ``compact_name``, ``participants``, ``messages``, ``friend``,
            ``messages_dir`` and ``media_dir``.
        """
        name_data_map = {}
        count = 0
        for file in jsons:
            msg = Messages(file)
            # These properties are looked up once per file, not once per
            # participant, to avoid repeated work inside the inner loop.
            thread_type = msg.ttype
            participants = msg.participants
            is_regular = thread_type == 'Regular'
            for participant in participants:
                key = participant if is_regular else f'group_{count}'
                if key == 'Facebook User':  # TODO ?? what to do with this??
                    continue
                known = key in name_data_map
                if known and key.startswith('group'):
                    # making sure run only once even if it is a group
                    continue
                if known:
                    dfs = [name_data_map[key]['messages'], msg.df]
                    name_data_map[key]['messages'] = pd.concat(dfs, ignore_index=False).sort_index()
                else:
                    name_data_map[key] = {
                        'title': msg.title,
                        'compact_name': msg.compact_names,  # TODO is list ok for if length is only 1??
                        # 'participants': msg.participants + ['Levente Csőke'],
                        'participants': participants,
                        'messages': msg.df,
                        'friend': None,
                        'messages_dir': msg.messages_dir,
                        'media_dir': msg.media_dir,
                    }
            if thread_type == 'RegularGroup':
                count += 1

        return name_data_map
|
||
|
||
class Messages(FacebookData):
    """One message-thread JSON file, exposed as a date-indexed DataFrame."""

    def __init__(self, json_path):
        super().__init__(json_path)
        self.to_df()
        self.set_date_as_index()

    def to_df(self):
        """Load the ``messages`` entry of the decoded JSON into ``self._df``."""
        self._df = pd.DataFrame(self.decoded.get('messages'))

    def set_date_as_index(self):
        """Index the frame by message date and reverse the row order.

        A thread without any message has no ``timestamp_ms`` column; the
        (empty) frame is then left untouched instead of raising.
        """
        # TODO maybe not needed; could calculate real time
        if 'timestamp_ms' not in self._df.columns:
            return
        date_series = self._df.timestamp_ms.apply(self.ts_to_date)
        self._df = self._df.set_index(date_series).iloc[::-1]

    @property
    def names(self):
        """Participant names as a pandas Series."""
        return pd.DataFrame(self.participants)[0]

    @property
    def participants(self):
        """Names of the participants, excluding 'Levente Csőke'."""
        # A missing 'participants' entry is treated as an empty list.
        participants = self.decoded.get('participants') or []
        # TODO I should be IN
        # but this breaks stuff at TestMessagingAnalyzer
        return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke']
        # return [p.get('name') for p in participants if p.get('name')]

    @property
    def title(self):
        return self.decoded.get('title')

    @property
    def ttype(self):
        return self.decoded.get('thread_type')

    @property
    def messages_dir(self):
        """Lower-cased directory name taken from ``thread_path``.

        :raises ValueError: if ``thread_path`` is missing or does not start
            with ``'inbox/'``.
        """
        thread_path = self.decoded.get('thread_path')
        if not thread_path or not thread_path.startswith('inbox/'):
            raise ValueError('Something is not okay.')
        # TODO here or in the upper function where we extract names
        return thread_path.split('/')[1].lower()

    @property
    def media_dir(self):
        """Grand-parent directory of the first media URI found, else ``None``.

        Media columns are checked in a fixed order; empty attachment lists
        and attachments without a ``uri`` are skipped rather than raising.
        """
        # todo what should the path contain
        for media in ['photos', 'gifs', 'files', 'videos', 'audio']:
            if media not in self._df.columns:
                continue
            # One message may carry several attachments (think multiple
            # photos sent at once); the first one is enough to find the dir.
            for attachments in self._df[media].dropna():
                if not attachments:
                    continue
                uri = attachments[0].get('uri')
                if uri is not None:
                    return os.path.dirname(os.path.dirname(uri))
        return None

    @staticmethod
    def ts_to_date(date):
        """Convert a millisecond timestamp to a local, naive ``datetime``."""
        return datetime.fromtimestamp(date / 1000)  # .strftime('%Y-%m-%d')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from utils import read_json, decode_text, accents_map | ||
|
||
|
||
class FacebookData:
    """Base class for one JSON file of a Facebook data export.

    Subclasses are expected to provide ``names`` (a pandas Series of str)
    and to fill ``self._df``.
    """

    def __init__(self, json_path):
        self.json_path = json_path
        self._df = None
        # Cache for ``decoded`` so the file is parsed and decoded only once.
        self._decoded = None

    @property
    def df(self):
        return self._df

    @property
    def decoded(self):
        """Decoded JSON content, computed on first access and then reused."""
        cached = getattr(self, '_decoded', None)
        if cached is None:
            cached = self._decoded = decode_text(self.json)
        return cached

    @property
    def json(self):
        """Raw JSON content, read from ``self.json_path`` on every access."""
        return read_json(self.json_path)

    @property
    def compact_names(self):
        """Lower-cased names without accents and spaces.

        Returns a single string when there is exactly one name, otherwise
        a list of strings.
        """
        # NOTE this is the place where we change pd/np to builtin
        # do we have to do this?
        name_list = list(self.without_accent_and_whitespace(self.lower_names(self.names)))
        return name_list[0] if len(name_list) == 1 else name_list

    @staticmethod
    def lower_names(col):
        """Lower-case every string of the Series ``col``."""
        return col.str.lower()

    @staticmethod
    def without_accent_and_whitespace(col):
        """Apply ``accents_map`` replacements and strip spaces from each value."""
        def replace_accents(text):
            for char, plain in accents_map.items():
                text = text.replace(char, plain)
            return text.replace(' ', '')

        return col.apply(replace_accents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import pandas as pd | ||
import os | ||
from FacebookData import FacebookData | ||
from utils import accents_map | ||
|
||
|
||
class Friends(FacebookData):
    """The friends list of a Facebook data export."""

    def __init__(self, *args):
        super().__init__(*args)
        self.to_df()

    def get_people(self):
        """Return a mapping of friend name -> data dict.

        Each entry has the same keys as the entries built for conversations
        (``title``, ``compact_name``, ``messages``, ``friend``,
        ``participants``, ``messages_dir``, ``media_dir``), with
        ``friend`` set to ``True``. An empty friends list yields ``{}``.
        """
        if 'name' not in self.df.columns:
            return {}
        compact_names = self.compact_names
        # ``compact_names`` is a bare string when there is exactly one
        # name; wrap it so zip() does not iterate over its characters.
        if isinstance(compact_names, str):
            compact_names = [compact_names]
        return {
            name: {
                'title': name,
                'compact_name': compact,
                'messages': None,
                'friend': True,
                'participants': None,
                'messages_dir': None,
                'media_dir': None,
            }
            for name, compact in zip(self.names, compact_names)
        }

    def to_df(self):
        """Load the ``friends`` entry of the decoded JSON into ``self._df``."""
        self._df = pd.DataFrame(self.decoded.get('friends'))

    @property
    def names(self):
        """The ``name`` column of the friends frame."""
        return self.df.name
Oops, something went wrong.