Skip to content

Commit

Permalink
refactored MsgA and ConvoA into Analyzer; added lot of functionalitie…
Browse files Browse the repository at this point in the history
…s for gathering data for plotting
  • Loading branch information
tardigrde committed Aug 14, 2020
1 parent 81ecfba commit b2f725e
Show file tree
Hide file tree
Showing 21 changed files with 670 additions and 677 deletions.
21 changes: 8 additions & 13 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -228,24 +228,19 @@ dmypy.json

# End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebook,pycharm,visualstudiocode

# ignoring data folder
data

# ignoring data
data
# ignoring jupyter notebook
tests/playground.py

# ignoring todo
# ignoring various files created during development
plots
*.png
todo.md


# ignoring trash file
trash.py


# ignoring jupyter notebook
explore.ipynb


# ignoring jupyter notebook
tests/playground.py
tests/test_data/messages/inbox/private_messages.json

tests/.pytest_cache
.pytest_cache
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,10 @@ More info soon...
## Contribution
Help is more than welcome. If somebody feel the urge to contribute, I would share my plans with them.

Ideas are welcome too. Feel free to open a new issue.
Ideas are welcome too. Feel free to open a new issue.


For running VIsualizer CLI:
```shell script
export PYTHONPATH="$PWD"
```
133 changes: 133 additions & 0 deletions miner/Analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from miner.ConversationStats import ConversationStats
from miner import utils
import pandas as pd


class Analyzer:
# TODO do we need to override __subclasscheck__ ?

# def __new__(cls, name, messages, *args, **kwargs):
# if messages is None: # This deals with the case if no messages
# return None
# return super(Analyzer, cls).__new__(cls, *args, **kwargs)

def __init__(self, people):
self.people = people
self.people_data = people.data
self.names = people.names
self.multi = len(self.people_data) > 1

if self.multi:
self.df = self.stack_dfs()
else:
# TODO solve this hand in hand with the __new__ method. too ugly
self.df = self.people_data.get(list(self.names)[0]).messages

def get_stats_for_intervals(self, time_series, subject='all'):
data = {}
for i in range(len(time_series) - 1): # only looping len - 1 times
start = time_series[i]
end = time_series[i + 1]
data[start] = self.get_stats(self.df, subject=subject, start=start, end=end)
return data

def get_stats(self, df=None, subject='all', start=None, end=None, period=None):
df = self.df if df is None else df
df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
stats = ConversationStats(df)
return stats

@staticmethod
def get_plottable_time_series_data(interval_stats, statistic):
for k, v in interval_stats.items():
if isinstance(v, ConversationStats):
interval_stats[k] = getattr(v, statistic)
return interval_stats

@property
def stats(self):
return self.get_stats()

def __str__(self):
if self.multi:
return self.names
else:
return f'{self.names[0]}: {list(self.df.index)}'

def stack_dfs(self):
dfs = []
for data in self.people_data.values():
if data.messages is not None:
dfs.append(data.messages)
return pd.concat(dfs).sort_index()

# 1. Total count of messages/words/characters (also by year/month/day/hour)
# 2. Total count of messages/words/characters sent (also by year/month/day/hour)
# 3. Total count of messages/words/characters received (also by year/month)
def get_count(self, attribute, subject='all', start=None, end=None, period=None):
stats = self.get_stats(subject=subject, start=start, end=end, period=period)
return getattr(stats, attribute)

#################

# 4. Most used messages/words in convos by me/partner (also by year/month/day/hour)
def most_used_messages_(self, **kwargs):
"""
>>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1])
>>> s2 = pd.Series([3, 2, 1, 1])
>>> s1_vc = s1.value_counts()
>>> s2_vc = s2.value_counts()
TODO LATER most used is already a problem:
- because its a series of all the unique messages/words ever used in a convo
- it contains strings like ':d', ':p' and 'xd'
- from all the convos the result of value_counts has to be cleared
and has to be truncated (that is not use the 200th most used word, only top10 let's say)
- then these series has to be merged in a way that the same string's counts are added up
- what about typos????!
"""
pass

# 5. Number of messages sent/got on busiest period (by year/month/day/hour)
def stat_per_period(self, period, attribute, **kwargs):
interval_stats = self.get_time_series_data(period, **kwargs)
# TODO attribute is one of (msg, word, char)
time_series_data = self.get_plottable_time_series_data(interval_stats, statistic=attribute)
return utils.count_stat_for_period(time_series_data, period)

# 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos'
def get_time_series_data(self, period, subject='all', **kwargs):
time_series = utils.generate_date_series(period, **kwargs)
return self.get_stats_for_intervals(self.df, time_series, subject=subject)

# # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got
def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None,
period=None):
# TODO almost the same function as get_count
count_dict = {}
for name in self.names:
# analyzer = Analyzer({name: self.people.get(name)}) # this has to be a people instance?! OR?
# analyzer = Analyzer(People(self.people.data_path, name=name)) # this has to be a people instance?! OR?
df = self.df[self.df.partner == name]
stats = self.get_stats(df=df, subject=subject, start=start, end=end, period=period)
if stats is not None:
count_dict = utils.fill_dict(count_dict, name, getattr(stats, attribute))

count_dict = {key: value for key, value in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)}
return count_dict

@staticmethod
@utils.subject_checker
@utils.date_checker
@utils.period_checker
def filter_by_input(df, subject='all', start=None, end=None, period=None):
if subject == 'me':
df = df[df.sender_name == 'Levente Csőke']
elif subject == 'partner':
df = df[df.sender_name != 'Levente Csőke']
if start and end:
df = df.loc[start:end]
elif start and not end:
df = df.loc[start:start + period]
elif not start and end:
df = df.loc[end - period:end]
return df
26 changes: 4 additions & 22 deletions miner/App.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from miner.ConversationAnalyzer import ConversationAnalyzer
from miner.MessagingAnalyzer import MessagingAnalyzer
from miner.Analyzer import Analyzer

from miner.People import People

DATA_PATH = '/home/levente/projects/facebook-data-miner/data'
Expand All @@ -13,26 +13,8 @@ def __init__(self):
def analyze_messages():
p = People(path=DATA_PATH)

stats = {}

for name, person in p.data.items():
if person.messages is None:
stats[person.name] = None
continue
analyzer = ConversationAnalyzer(person.name, person.messages)
stats[person.name] = analyzer.stats
# if stats[person.name].get('message_count').get('me') > 5000:
# top[person.name] = stats[person.name]
print()

# print('LEN: ', len(top.keys()))
# top_all = {name: data.get('message_count').get('all') for name, data in top.items()}
# analyzer.visualize_stats(top)

@staticmethod
def analyze_messaging():
people = People(path=DATA_PATH)
msg_analyzer = MessagingAnalyzer(people)
analyzer = Analyzer(p)
rank = analyzer.get_ranking_of_friends_by_messages(attribute='char_count')


if __name__ == '__main__':
Expand Down
146 changes: 0 additions & 146 deletions miner/ConversationAnalyzer.py

This file was deleted.

Loading

0 comments on commit b2f725e

Please sign in to comment.