Skip to content

Commit

Permalink
refactored analyzer; added some documentation; resolved TODOs
Browse files Browse the repository at this point in the history
  • Loading branch information
tardigrde committed Aug 16, 2020
1 parent b2f725e commit 55ba0d0
Show file tree
Hide file tree
Showing 17 changed files with 166 additions and 113 deletions.
108 changes: 51 additions & 57 deletions miner/Analyzer.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,45 @@
import pandas as pd

from miner.ConversationStats import ConversationStats
from miner import utils
import pandas as pd


class Analyzer:
# TODO do we need to override __subclasscheck__ ?
"""
Analyzer for analyzing specific and/or all conversations
# def __new__(cls, name, messages, *args, **kwargs):
# if messages is None: # This deals with the case if no messages
# return None
# return super(Analyzer, cls).__new__(cls, *args, **kwargs)
"""

def __init__(self, people):
self.people = people
self.people_data = people.data
self.names = people.names
self.names = list(people.names)
self.multi = len(self.people_data) > 1

if self.multi:
self.df = self.stack_dfs()
self.df = self.stack_dfs(self.people_data)
else:
self.df = self.people_data.get(self.names[0]).messages

def __str__(self):
if self.multi:
return self.names
else:
# TODO solve this hand in hand with the __new__ method. too ugly
self.df = self.people_data.get(list(self.names)[0]).messages
return f'{self.names[0]}: {list(self.df.index)}'

def get_stats_for_intervals(self, time_series, subject='all'):
@property
def stats(self):
return self.get_stats()

def get_stats_for_intervals(self, time_series, period, subject='all'):
data = {}
for i in range(len(time_series) - 1): # only looping len - 1 times
for i in range(len(time_series)):
start = time_series[i]
end = time_series[i + 1]
data[start] = self.get_stats(self.df, subject=subject, start=start, end=end)
try: # with this solution we will have data for the very last moments until datetime.now()
end = time_series[i + 1]
except IndexError:
end = None
data[start] = self.get_stats(df=self.df, subject=subject, start=start, end=end, period=period)
return data

def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
Expand All @@ -37,39 +48,13 @@ def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
stats = ConversationStats(df)
return stats

@staticmethod
def get_plottable_time_series_data(interval_stats, statistic):
for k, v in interval_stats.items():
if isinstance(v, ConversationStats):
interval_stats[k] = getattr(v, statistic)
return interval_stats

@property
def stats(self):
return self.get_stats()

def __str__(self):
if self.multi:
return self.names
else:
return f'{self.names[0]}: {list(self.df.index)}'

def stack_dfs(self):
dfs = []
for data in self.people_data.values():
if data.messages is not None:
dfs.append(data.messages)
return pd.concat(dfs).sort_index()

# 1. Total count of messages/words/characters (also by year/month/day/hour)
# 2. Total count of messages/words/characters sent (also by year/month/day/hour)
# 3. Total count of messages/words/characters received (also by year/month)
def get_count(self, attribute, subject='all', start=None, end=None, period=None):
stats = self.get_stats(subject=subject, start=start, end=end, period=period)
return getattr(stats, attribute)

#################

# 4. Most used messages/words in convos by me/partner (also by year/month/day/hour)
def most_used_messages_(self, **kwargs):
"""
Expand All @@ -88,38 +73,47 @@ def most_used_messages_(self, **kwargs):
pass

# 5. Number of messages sent/got on busiest period (by year/month/day/hour)
def stat_per_period(self, period, attribute, **kwargs):
def stat_per_period(self, period, statistic, **kwargs):
    """
    Aggregate one statistic of the time series per ``period``.

    The interval statistics come from ``get_time_series_data``, are reduced to
    the requested attribute by ``get_stat_count`` and are then handed to
    ``utils.count_stat_for_period``.

    :param period: passed to ``get_time_series_data`` and ``utils.count_stat_for_period``
    :param statistic: name of the attribute read from every interval's stats object
    :param kwargs: forwarded unchanged to ``get_time_series_data``
    :return: whatever ``utils.count_stat_for_period`` returns
    """
    interval_stats = self.get_time_series_data(period, **kwargs)
    time_series_data = self.get_stat_count(interval_stats, statistic=statistic)
    return utils.count_stat_for_period(time_series_data, period)

# 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos'
# 6. Time series: dict of 'y/m/d/h : number of messages/words/characters (also sent/got) for user/all convos'
def get_time_series_data(self, period, subject='all', **kwargs):
    """
    Compute the statistics for every interval of a generated date series.

    :param period: passed to ``utils.generate_date_series`` and on to ``get_stats_for_intervals``
    :param subject: forwarded unchanged to ``get_stats_for_intervals``
    :param kwargs: forwarded unchanged to ``utils.generate_date_series``
    :return: dict returned by ``get_stats_for_intervals``
    """
    time_series = utils.generate_date_series(period=period, **kwargs)
    # the series goes first and the period second; the message frame is read from self there
    return self.get_stats_for_intervals(time_series, period, subject=subject)

# # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got
def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None,
period=None):
# TODO almost the same function as get_count
# # 7. Ranking of partners by messages by y/m/d/h, by different stats, by sent/got
def get_ranking_of_partners_by_messages(self, statistic='msg_count', **kwargs):
    """
    Rank the partners by one statistic of the messages exchanged with them.

    :param statistic: name of the attribute read from each partner's stats object
    :param kwargs: forwarded unchanged to ``get_stats``
    :return: dict built through ``utils.fill_dict`` (presumably name -> value,
        verify against that helper), ordered from the highest value to the lowest
    """
    count_dict = {}
    for name in self.names:
        # only the rows whose partner column equals this name
        df = self.df[self.df.partner == name]
        stats = self.get_stats(df=df, **kwargs)
        if stats is not None:
            count_dict = utils.fill_dict(count_dict, name, getattr(stats, statistic))
    # highest value first, so that iterating the dict yields the ranking
    return dict(sorted(count_dict.items(), key=lambda item: item[1], reverse=True))

@staticmethod
def stack_dfs(people_data):
dfs = []
for data in people_data.values():
if data.messages is not None:
dfs.append(data.messages)
return pd.concat(dfs).sort_index()

@staticmethod
@utils.attribute_checker
def get_stat_count(interval_stats, statistic='msg_count'):
    """
    Replace every stats object of ``interval_stats`` by one of its attributes.

    The dict is modified in place and the very same object is returned.

    :param interval_stats: dict whose values expose the attribute named by ``statistic``
    :param statistic: name of the attribute to keep for every key
    :return: ``interval_stats`` with each value swapped for that attribute
    """
    for key in interval_stats:
        # only existing keys are reassigned, so the dict keeps its size while iterating
        interval_stats[key] = getattr(interval_stats[key], statistic)
    return interval_stats

@staticmethod
@utils.subject_checker
@utils.date_checker
@utils.period_checker
@utils.start_end_period_checker
def filter_by_input(df, subject='all', start=None, end=None, period=None):

if subject == 'me':
df = df[df.sender_name == 'Levente Csőke']
elif subject == 'partner':
Expand Down
11 changes: 8 additions & 3 deletions miner/App.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from miner.Analyzer import Analyzer
import os

from miner.Analyzer import Analyzer
from miner.People import People

DATA_PATH = '/home/levente/projects/facebook-data-miner/data'
DATA_PATH = f'{os.getcwd()}/data'


class App:
"""
Entrypoint. Not yet used extensively.
# TODO LATER turn it into a cli
"""
def __init__(self):
pass

Expand All @@ -14,7 +19,7 @@ def analyze_messages():
p = People(path=DATA_PATH)

analyzer = Analyzer(p)
# the parameter is called `statistic`; an `attribute=` keyword would travel through
# **kwargs into get_stats(), which does not accept it, and raise a TypeError
rank = analyzer.get_ranking_of_partners_by_messages(statistic='char_count')


if __name__ == '__main__':
Expand Down
16 changes: 9 additions & 7 deletions miner/Conversations.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
import pandas as pd
import os


from miner.Messages import Messages
from miner.Individual import Individual

from miner import utils


class Conversations:
"""
Class for managing and parsing conversations
"""

def __init__(self, data_path):
self.private_convo_paths = {}
self.group_convo_paths = {} # TODO fill this as well
self.group_convo_paths = {} # TODO LATER fill this as well
self.deleted_user_convo_paths = [] # NOTE these are collected but not yet used

self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}'
Expand Down Expand Up @@ -41,7 +44,7 @@ def differentiate_paths(self, jsons):

def register_paths(self):
utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json')

def read_paths(self, file):
self.private_convo_paths = utils.read_json(file)
print()
Expand All @@ -55,6 +58,8 @@ def map_private_convo_files(self, msg, file):

def map_group_convo_files(self, msg, file):
for participant in msg.participants:
if participant == 'Levente Csőke':
continue
if self.group_convo_paths.get(file):
self.group_convo_paths[file].append(participant)
else:
Expand Down Expand Up @@ -100,7 +105,4 @@ def group_membership(name):
return None

def get_people_from_group_messages(self):
pass # TODO for v0.0.4



pass
6 changes: 5 additions & 1 deletion miner/FacebookData.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@


class FacebookData:
"""
Base class for reading in tabular data from JSONs.
"""

def __init__(self, json_path):
self.json_path = json_path
self._df = None
Expand All @@ -21,7 +25,7 @@ def json(self):

@property
def compact_names(self):
name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) # should be just fine
name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names)))
return name_list[0] if len(name_list) == 1 else name_list

def to_df(self, field=None):
Expand Down
3 changes: 3 additions & 0 deletions miner/Friends.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@


class Friends(FacebookData):
"""
Class for storing data in friends.json
"""

def __init__(self, *args):
super().__init__(*args)
Expand Down
8 changes: 4 additions & 4 deletions miner/Group.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@


class Group:
"""
Class for holding a group-message's data
"""

def __init__(self, name=None, title=None, messages=None, compact=None, messages_dir=None, media_dir=None,
members=None):
self._name = name
Expand All @@ -26,10 +30,6 @@ def title(self):
def messages(self):
return self._messages

# @property
# def get_message_jsons(self):
# return self._messages

@property
def media_dir(self):
return self._media_dir
Expand Down
4 changes: 4 additions & 0 deletions miner/Individual.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
class Individual:
"""
Class for holding a person's data the user ever interacted with
"""

def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None,
media_dir=None,
member_of=None):
Expand Down
4 changes: 4 additions & 0 deletions miner/Me.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@


class Me(FacebookData):
"""
Class for storing basic data about the user
"""

def __init__(self, *args):
super().__init__(*args)

Expand Down
10 changes: 5 additions & 5 deletions miner/Messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@


class Messages(FacebookData):
"""
Class for representing data of all the messages with a user or a group
"""

def __init__(self, json_path):
super().__init__(json_path)
self.to_df('messages')
Expand All @@ -15,7 +19,6 @@ def __init__(self, json_path):

@property
def names(self):
# TODO ugly
try:
return pd.DataFrame(self.participants)[0]
except KeyError:
Expand All @@ -24,10 +27,7 @@ def names(self):
@property
def participants(self):
participants = self.decoded.get('participants')
# TODO I should be IN
# but this breaks stuff at TestMessagingAnalyzer
return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke']
# return [p.get('name') for p in participants if p.get('name')]
return [p.get('name') for p in participants if p.get('name')]

@property
def title(self):
Expand Down
11 changes: 8 additions & 3 deletions miner/People.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import time
import os

from miner.Conversations import Conversations
from miner.Friends import Friends

DATA_PATH = '/home/levente/projects/facebook-data-miner/data'
DATA_PATH = f'{os.getcwd()}/data'


class People:
"""
Class that manages and represents people from different kind of interactions
# TODO LATER abstractional flaw?! people? person? indie?
"""

def __init__(self, path=None, name=None):
self.data_path = path if path else DATA_PATH
self._groups = []
Expand All @@ -19,7 +25,7 @@ def data(self):

@property
def names(self):
return self._names #if len(self._names) > 1 else self._names[0]
return self._names # if len(self._names) > 1 else self._names[0]

@property
def groups(self):
Expand All @@ -31,7 +37,6 @@ def get_people(self, name=None):
friends = friend.get_people(name=name)
print('friends: ', time.time() - start)

# TODO LATER too slow -> store in file
start = time.time()
conversations = Conversations(self.data_path)
print('convos1: ', time.time() - start)
Expand Down
4 changes: 0 additions & 4 deletions miner/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
numpy==1.18.1
pandas==1.0.3
dateparser==0.7.6
seaborn==0.10.1
matplotlib==3.2.1
plotly==4.8.2
miner==0.0.0
Pillow==7.2.0
python_dateutil==2.8.1
Loading

0 comments on commit 55ba0d0

Please sign in to comment.