added more test data and hot fixes #3

Merged: 1 commit, Aug 11, 2020
22 changes: 11 additions & 11 deletions ConversationAnalyzer.py
@@ -19,22 +19,22 @@ def __str__(self):
def stats(self):
return self.get_stats(self.df)

# TODO has to be tested
def get_stats(self, df, subject='all', start=None, end=None, period=None):
df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
stats = ConversationStats(df)
return stats

def get_time_series_data(self, subject='all', **kwargs):
time_series = generate_time_series(**kwargs)
return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject)

def get_plotable_time_series_data(self, interval_stats, statistic):
@staticmethod
def get_plottable_time_series_data(interval_stats, statistic):
for k, v in interval_stats.items():
if isinstance(v, ConversationStats):
interval_stats[k] = getattr(v, statistic)
return interval_stats

def get_stats(self, df, subject='all', start=None, end=None, period=None):
df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period)
stats = ConversationStats(df)
return stats

@staticmethod
@subject_checker
@date_checker
@@ -91,13 +91,13 @@ def unique_msg_count(self):
# 3.
@property
def most_used_msgs(self):
# TODO first few (1-10) messages
# TODO LATER first few (1-10) messages
return self.messages.value_counts()

# 4.
@property
def msg_frequency(self):
# TODO this has been most likely depracated
# NOTE this has been most likely depracated OR?
pass

# 5.
@@ -132,12 +132,12 @@ def char_count(self):
# 10.
@property
def most_used_chars(self):
return None # TODO or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string
return None # TODO LATER or not https://stackoverflow.com/questions/4131123/finding-the-most-frequent-character-in-a-string

# 11.
@property
def rate_of_media_messages(self):
pass # TODO what?
pass # NOTE what?

def get_words(self):
token_list = self.messages.str.lower().str.split()
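For readers skimming this file's diff: the rename to `get_plottable_time_series_data` also made the method a `@staticmethod`, since it only transforms a dict. A minimal runnable sketch of that transformation, with a hypothetical `FakeStats` standing in for the real `ConversationStats`:

```python
# Hypothetical stand-in for ConversationStats, only to make the sketch runnable.
class FakeStats:
    def __init__(self, msg_count):
        self.msg_count = msg_count

def get_plottable_time_series_data(interval_stats, statistic):
    # The real method also guards with isinstance(v, ConversationStats);
    # the core idea is turning date -> stats object into date -> number.
    for k, v in interval_stats.items():
        interval_stats[k] = getattr(v, statistic)
    return interval_stats

intervals = {'2014-12': FakeStats(6), '2015-01': FakeStats(11)}
print(get_plottable_time_series_data(intervals, 'msg_count'))
# -> {'2014-12': 6, '2015-01': 11}
```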
4 changes: 2 additions & 2 deletions Conversations.py
@@ -43,7 +43,7 @@ def extract_names_from_convos(jsons):
else:
name_data_map[key] = {
'title': msg.title,
'compact_name': msg.compact_names, # TODO is list ok for if length is only 1??
'compact_name': msg.compact_names,
# 'participants': msg.participants + ['Levente Csőke'],
'participants': msg.participants,
'messages': msg.df,
@@ -67,7 +67,7 @@ def to_df(self):
self._df = pd.DataFrame(self.decoded.get('messages'))

def set_date_as_index(self):
# TODO maybe not needed; could calculate real time
# NOTE maybe not needed; could calculate real time
date_series = self._df.timestamp_ms.apply(self.ts_to_date)
self._df = self._df.set_index(date_series).iloc[::-1]

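The `set_date_as_index` change only touches the comment, but the surrounding logic is worth illustrating. A sketch under the assumption that `ts_to_date` converts Facebook's millisecond timestamps to datetimes (the real helper lives on the class and may differ):

```python
import pandas as pd
from datetime import datetime

def ts_to_date(ts_ms):
    # Assumed behavior of the class's ts_to_date helper.
    return datetime.fromtimestamp(ts_ms / 1000)

df = pd.DataFrame({'timestamp_ms': [1418100000000, 1418000000000],
                   'content': ['second', 'first']})
date_series = df.timestamp_ms.apply(ts_to_date)
# Facebook exports list messages newest-first; reversing restores chronology.
df = df.set_index(date_series).iloc[::-1]
print(df.index.is_monotonic_increasing)  # True
```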
2 changes: 1 addition & 1 deletion Group.py
@@ -1,4 +1,4 @@
# TODO groups should be searched by looking into jsons unfortunately :(
# TODO LATER groups should be searched by looking into jsons unfortunately :(
# because of directory says others
# maybe we dont use groups right away?

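The TODO LATER note says group detection will eventually have to read the JSON payloads rather than trust directory names. One possible shape of that scan, with the paths and field names assumed from the typical Facebook export layout, not taken from this codebase:

```python
import json
from pathlib import Path

def find_group_threads(export_root):
    # Assumed layout: messages/inbox/<thread>/message_1.json per thread.
    titles = []
    for path in Path(export_root).glob('messages/inbox/*/message_1.json'):
        data = json.loads(path.read_text())
        # Heuristic: a thread with more than two participants is a group.
        if len(data.get('participants', [])) > 2:
            titles.append(data.get('title'))
    return titles
```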
25 changes: 10 additions & 15 deletions MessagingAnalyzer.py
@@ -1,26 +1,22 @@
from utils import year_converter, month_converter, generate_time_series, get_stats_for_intervals
from utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals
from datetime import datetime, date, timedelta
import pandas as pd
from ConversationAnalyzer import ConversationAnalyzer

"""

"""


class MessagingAnalyzer:
def __init__(self, names, people):
# TODO input people only. class ill know what to do
# TODO input people only. class will know what to do
self.names = names
self.people = people

def time_series_analysis_for_all(self, subject=None, **kwargs):
time_series = generate_time_series(**kwargs)
time_series = generate_date_series(**kwargs)
stacked_df = self.stack_dfs(self.people)
interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject)

def get_stats(self, df, subject='all', start=None, end=None, period=None):
# TODO
# TODO LATER
# here you have to do something with it
pass

@@ -29,14 +25,14 @@ def get_count(self, attribute, subject='all', start=None, end=None, period=None):
# we have a list of names we want to iterate over
for name in self.names:
stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period)
if stats is not None: # TODO too explicit; needed because it is possible that None will be returned, if t got an empty df
if stats is not None:
count += getattr(stats, attribute)
return count

def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None):
messages = self.people.get(name).messages
analyzer = ConversationAnalyzer(name, messages)
if analyzer is None: # TODO this is too explicit ?!
if analyzer is None:
return None
return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period)

@@ -80,7 +76,7 @@ def most_used_messages_by_me(self, **kwargs):
>>> s2 = pd.Series([3, 2, 1, 1])
>>> s1_vc = s1.value_counts()
>>> s2_vc = s2.value_counts()
TODO (later) most used is already a problem:
TODO LATER most used is already a problem:
- because its a series of all the unique messages/words ever used in a convo
- it contains strings like ':d', ':p' and 'xd'
- from all the convos the result of value_counts has to be cleared
@@ -101,14 +97,14 @@ def most_used_words_by_partners(self, **kwargs):

# 5. Number of messages sent/got on busiest period (by year/month/day/hour)
def days_when_most_messages_sent(self):
# TODO hard algorithmic problem
# TODO LATER hard algorithmic problem
pass

def days_when_most_messages_received(self):
pass

def hours_when_most_messages_sent(self):
# TODO
# TODO LATER
# is this referring to the absolute hour most messages sent??
# like: 2014.07.25. 15h-16h
# OR
@@ -131,5 +127,4 @@ def stack_dfs(people):
for data in people.values():
if data.messages is not None:
dfs.append(data.messages)
# TODO do I need to sort by index (date)? yes!
return pd.concat(dfs).sort_index() # TODO why ignore_index??
return pd.concat(dfs).sort_index()
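The `stack_dfs` cleanup settles both of the old TODOs: sorting by the date index is indeed needed, and `ignore_index` would have thrown that index away. A tiny sketch with made-up frames showing why the sort matters:

```python
import pandas as pd

a = pd.DataFrame({'content': ['hi']}, index=pd.to_datetime(['2014-12-02']))
b = pd.DataFrame({'content': ['yo']}, index=pd.to_datetime(['2014-11-30']))

# Concatenating keeps each person's dates; sorting interleaves them
# chronologically so date-range filtering works on the stacked frame.
stacked = pd.concat([a, b]).sort_index()
print(stacked.index.is_monotonic_increasing)  # True
```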
36 changes: 20 additions & 16 deletions Visualizer.py
@@ -4,33 +4,37 @@
from People import People
from ConversationAnalyzer import ConversationAnalyzer


# plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120})

TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data'


class Visualizer:
def __init__(self):
pass

def plot_time_series(self, x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100):
def plot_convos(self, names):
people = People(path=TEST_DATA_PATH)
for name in names:
data = self.set_up_data(people, name, period='d')
df = pd.DataFrame(data.items(), columns=['date', 'value'])
v.plot_time_series(x=df.date, y=df.value, title=name)

@staticmethod
def set_up_data(people, name, period='y'):
analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages)
interval_stats = analyzer.get_time_series_data(subject='all', start=None, end=None, period=period)
return analyzer.get_plottable_time_series_data(interval_stats, statistic='msg_count')

@staticmethod
def plot_time_series(x, y, title="Time series", xlabel='Date', ylabel='Value', dpi=100):
plt.figure(figsize=(16, 5), dpi=dpi)
plt.plot(x, y, color='tab:red')
plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
plt.show()


def set_up(people, name, interval='y'):
analyzer = ConversationAnalyzer(name, people.individuals.get(name).messages)
interval_stats = analyzer.get_time_series_data()
stats = interval_stats.get(interval)
return analyzer.get_plotable_time_series_data(stats, statistic='msg_count')


if __name__ == "__main__":
v = Visualizer()
TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data'
people = People(path=TEST_DATA_PATH)
names = ['Teflon Musk', 'Tőke Hal']
for name in names:
data = set_up(people, name, interval='d')
df = pd.DataFrame(data.items(), columns=['date', 'value'])
v.plot_time_series(x=df.date, y=df.value, title=name)
v = Visualizer()
v.plot_convos(names)
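Since `plot_time_series` is now a `@staticmethod`, it can be exercised on its own without the `People` setup; a quick demo with made-up data, assuming the module imports as `Visualizer`:

```python
import pandas as pd
from Visualizer import Visualizer

dates = pd.date_range('2014-12-01', periods=30, freq='D')
values = list(range(30))
# No instance needed now that the helper is static.
Visualizer.plot_time_series(x=dates, y=values, title='Demo')
```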
9 changes: 6 additions & 3 deletions tests/TestConversationAnalyzer.py
@@ -43,7 +43,7 @@ def _stats(name, **kwargs):
return _stats


# TODO extend all functions with all the data
# TODO LATER or not extend all functions with all the data
def test_stats_toke_hal_all(statistics):
stats = statistics('Tőke Hal')

@@ -133,8 +133,8 @@ def test_stats_toke_hal_me_2014_12(statistics):
def test_stats_teflon_musk(statistics):
stats = statistics('Teflon Musk')
assert stats.msg_count == 6
assert stats.unique_msg_count == 2 # TODO this does not count media messages
# assert stats.most_used_msgs == 0 # TODO should only return the most used or e.g. top10 most used
assert stats.unique_msg_count == 2
# assert stats.most_used_msgs == 0 # TODO LATER should only return the most used or e.g. top10 most used
# assert stats.msg_frequency == 0
assert stats.word_count == 14
assert stats.unique_word_count == 7
@@ -224,6 +224,9 @@ def test_stats_teflon_musk_all_2014_12(statistics):
assert stats.char_count == 0
# assert stats.most_used_chars == 0

class TestConversationAnalyzer: # Foo Bar
pass


def test_time_series_analysis_for_user(analyze):
analyzer = analyze('Teflon Musk')
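These tests build on a factory-style `statistics` pytest fixture (visible at the top of the test diff). A minimal sketch of the pattern, with the body stubbed out since the real fixture constructs ConversationAnalyzer stats:

```python
import pytest

@pytest.fixture
def statistics():
    def _stats(name, **kwargs):
        # Stub: the real fixture returns stats for the named conversation,
        # filtered by subject/start/end/period.
        return {'name': name, **kwargs}
    return _stats

def test_factory_fixture(statistics):
    assert statistics('Teflon Musk', subject='me')['subject'] == 'me'
```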