diff --git a/.gitignore b/.gitignore index 5b8858a..4755795 100644 --- a/.gitignore +++ b/.gitignore @@ -228,24 +228,19 @@ dmypy.json # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebook,pycharm,visualstudiocode +# ignoring data folder +data -# ignoring data -data +# ignoring jupyter notebook +tests/playground.py -# ignoring todo +# ignoring various files created during development +plots +*.png todo.md - - -# ignoring trash file trash.py - - -# ignoring jupyter notebook explore.ipynb - - -# ignoring jupyter notebook -tests/playground.py +tests/test_data/messages/inbox/private_messages.json tests/.pytest_cache .pytest_cache \ No newline at end of file diff --git a/README.md b/README.md index 943a123..c4b3b6a 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,10 @@ More info soon... ## Contribution Help is more than welcome. If somebody feel the urge to contribute, I would share my plans with them. -Ideas are welcome too. Feel free to open a new issue. \ No newline at end of file +Ideas are welcome too. Feel free to open a new issue. + + +For running VIsualizer CLI: +```shell script +export PYTHONPATH="$PWD" +``` diff --git a/miner/Analyzer.py b/miner/Analyzer.py new file mode 100644 index 0000000..b0e61bc --- /dev/null +++ b/miner/Analyzer.py @@ -0,0 +1,133 @@ +from miner.ConversationStats import ConversationStats +from miner import utils +import pandas as pd + + +class Analyzer: + # TODO do we need to override __subclasscheck__ ? + + # def __new__(cls, name, messages, *args, **kwargs): + # if messages is None: # This deals with the case if no messages + # return None + # return super(Analyzer, cls).__new__(cls, *args, **kwargs) + + def __init__(self, people): + self.people = people + self.people_data = people.data + self.names = people.names + self.multi = len(self.people_data) > 1 + + if self.multi: + self.df = self.stack_dfs() + else: + # TODO solve this hand in hand with the __new__ method. too ugly + self.df = self.people_data.get(list(self.names)[0]).messages + + def get_stats_for_intervals(self, time_series, subject='all'): + data = {} + for i in range(len(time_series) - 1): # only looping len - 1 times + start = time_series[i] + end = time_series[i + 1] + data[start] = self.get_stats(self.df, subject=subject, start=start, end=end) + return data + + def get_stats(self, df=None, subject='all', start=None, end=None, period=None): + df = self.df if df is None else df + df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) + stats = ConversationStats(df) + return stats + + @staticmethod + def get_plottable_time_series_data(interval_stats, statistic): + for k, v in interval_stats.items(): + if isinstance(v, ConversationStats): + interval_stats[k] = getattr(v, statistic) + return interval_stats + + @property + def stats(self): + return self.get_stats() + + def __str__(self): + if self.multi: + return self.names + else: + return f'{self.names[0]}: {list(self.df.index)}' + + def stack_dfs(self): + dfs = [] + for data in self.people_data.values(): + if data.messages is not None: + dfs.append(data.messages) + return pd.concat(dfs).sort_index() + + # 1. Total count of messages/words/characters (also by year/month/day/hour) + # 2. Total count of messages/words/characters sent (also by year/month/day/hour) + # 3. 
Total count of messages/words/characters received (also by year/month) + def get_count(self, attribute, subject='all', start=None, end=None, period=None): + stats = self.get_stats(subject=subject, start=start, end=end, period=period) + return getattr(stats, attribute) + + ################# + + # 4. Most used messages/words in convos by me/partner (also by year/month/day/hour) + def most_used_messages_(self, **kwargs): + """ + >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) + >>> s2 = pd.Series([3, 2, 1, 1]) + >>> s1_vc = s1.value_counts() + >>> s2_vc = s2.value_counts() + TODO LATER most used is already a problem: + - because its a series of all the unique messages/words ever used in a convo + - it contains strings like ':d', ':p' and 'xd' + - from all the convos the result of value_counts has to be cleared + and has to be truncated (that is not use the 200th most used word, only top10 let's say) + - then these series has to be merged in a way that the same string's counts are added up + - what about typos????! + """ + pass + + # 5. Number of messages sent/got on busiest period (by year/month/day/hour) + def stat_per_period(self, period, attribute, **kwargs): + interval_stats = self.get_time_series_data(period, **kwargs) + # TODO attribute is one of (msg, word, char) + time_series_data = self.get_plottable_time_series_data(interval_stats, statistic=attribute) + return utils.count_stat_for_period(time_series_data, period) + + # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' + def get_time_series_data(self, period, subject='all', **kwargs): + time_series = utils.generate_date_series(period, **kwargs) + return self.get_stats_for_intervals(self.df, time_series, subject=subject) + + # # 7. Ranking of friends by messages by y/m/d/h, by different stats, by sent/got + def get_ranking_of_friends_by_messages(self, attribute='msg_count', subject='all', start=None, end=None, + period=None): + # TODO almost the same function as get_count + count_dict = {} + for name in self.names: + # analyzer = Analyzer({name: self.people.get(name)}) # this has to be a people instance?! OR? + # analyzer = Analyzer(People(self.people.data_path, name=name)) # this has to be a people instance?! OR? 
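The docstring of `most_used_messages_` above asks how per-conversation `value_counts()` results could be merged so that counts of the same string add up, and then truncated to a top-N. A minimal pandas sketch of one way to do that — illustrative only, not part of this diff; the sample series are invented:

```python
import pandas as pd

# Two toy conversations' message/word series, in the spirit of the s1/s2 doctest above.
s1 = pd.Series(['hi', 'ok', 'hi', 'xd'])
s2 = pd.Series(['ok', 'hi'])

# Merge the value_counts results so identical strings' counts are summed,
# then keep only a top-N slice instead of the whole tail.
merged = s1.value_counts().add(s2.value_counts(), fill_value=0).astype(int)
top = merged.sort_values(ascending=False).head(3)
print(top.to_dict())  # {'hi': 3, 'ok': 2, 'xd': 1}
```

The same "sum counts per key" idea is what `utils.fill_dict` does in the ranking loop that follows.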
+ df = self.df[self.df.partner == name] + stats = self.get_stats(df=df, subject=subject, start=start, end=end, period=period) + if stats is not None: + count_dict = utils.fill_dict(count_dict, name, getattr(stats, attribute)) + + count_dict = {key: value for key, value in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)} + return count_dict + + @staticmethod + @utils.subject_checker + @utils.date_checker + @utils.period_checker + def filter_by_input(df, subject='all', start=None, end=None, period=None): + if subject == 'me': + df = df[df.sender_name == 'Levente Csőke'] + elif subject == 'partner': + df = df[df.sender_name != 'Levente Csőke'] + if start and end: + df = df.loc[start:end] + elif start and not end: + df = df.loc[start:start + period] + elif not start and end: + df = df.loc[end - period:end] + return df diff --git a/miner/App.py b/miner/App.py index c2560bd..4989e8b 100644 --- a/miner/App.py +++ b/miner/App.py @@ -1,5 +1,5 @@ -from miner.ConversationAnalyzer import ConversationAnalyzer -from miner.MessagingAnalyzer import MessagingAnalyzer +from miner.Analyzer import Analyzer + from miner.People import People DATA_PATH = '/home/levente/projects/facebook-data-miner/data' @@ -13,26 +13,8 @@ def __init__(self): def analyze_messages(): p = People(path=DATA_PATH) - stats = {} - - for name, person in p.data.items(): - if person.messages is None: - stats[person.name] = None - continue - analyzer = ConversationAnalyzer(person.name, person.messages) - stats[person.name] = analyzer.stats - # if stats[person.name].get('message_count').get('me') > 5000: - # top[person.name] = stats[person.name] - print() - - # print('LEN: ', len(top.keys())) - # top_all = {name: data.get('message_count').get('all') for name, data in top.items()} - # analyzer.visualize_stats(top) - - @staticmethod - def analyze_messaging(): - people = People(path=DATA_PATH) - msg_analyzer = MessagingAnalyzer(people) + analyzer = Analyzer(p) + rank = analyzer.get_ranking_of_friends_by_messages(attribute='char_count') if __name__ == '__main__': diff --git a/miner/ConversationAnalyzer.py b/miner/ConversationAnalyzer.py deleted file mode 100644 index a5928bf..0000000 --- a/miner/ConversationAnalyzer.py +++ /dev/null @@ -1,146 +0,0 @@ -import pandas as pd -from miner.utils import date_checker, period_checker, subject_checker, generate_date_series, get_stats_for_intervals - - -class ConversationAnalyzer: - def __new__(cls, name, messages, *args, **kwargs): - if messages is None: # This deals with the case if no messages - return None - return super(ConversationAnalyzer, cls).__new__(cls, *args, **kwargs) - - def __init__(self, name, messages): - self.name = name - self.df = messages - - def __str__(self): - return f'{self.name}: {list(self.df.index)}' - - @property - def stats(self): - return self.get_stats(self.df) - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - df = self.filter_by_input(df, subject=subject, start=start, end=end, period=period) - stats = ConversationStats(df) - return stats - - def get_time_series_data(self, subject='all', **kwargs): - time_series = generate_date_series(**kwargs) - return get_stats_for_intervals(self.get_stats, self.df, time_series, subject=subject) - - @staticmethod - def get_plottable_time_series_data(interval_stats, statistic): - for k, v in interval_stats.items(): - if isinstance(v, ConversationStats): - interval_stats[k] = getattr(v, statistic) - return interval_stats - - @staticmethod - @subject_checker - @date_checker - @period_checker - 
def filter_by_input(df, subject='all', start=None, end=None, period=None): - if subject == 'me': - df = df[df.sender_name == 'Levente Csőke'] - elif subject == 'partner': - df = df[df.sender_name != 'Levente Csőke'] - if start and end: - df = df.loc[start:end] - elif start and not end: - df = df.loc[start:start + period] - elif not start and end: - df = df.loc[end - period:end] - return df - - -class ConversationStats: - """ - Statistics of conversation with one person. - """ - - def __init__(self, df): - self.df = df - - def __repr__(self): - return f'{self.msg_count}' - - @property - def messages(self): - return self.df.content.dropna() - - @property - def words(self): - return self.get_words() - - # 1. - @property - def msg_count(self): - return len(self.df) - - # 2. - @property - def unique_msg_count(self): - return len(self.messages.unique()) - - # 3. - @property - def most_used_msgs(self): - # TODO LATER first few (1-10) messages - return self.messages.value_counts() - - # 4. - @property - def msg_frequency(self): - # NOTE this has been most likely depracated OR? - pass - - # 5. - @property - def word_count(self): - return len(self.words) - - # 6. - @property - def unique_word_count(self): - return len(set(self.words)) - - # 7. - @property - def most_used_words(self): - s = pd.Series(self.words) - return s.value_counts() - - # 8. - @property - def word_frequency(self): - pass - - # 9. - @property - def char_count(self): - char_count = 0 - for word in self.words: - char_count += len(word) - return char_count - - # 10. - @property - def rate_of_media_messages(self): - """ - TODO LATER - search for media messages all 5 of them - rate is only the second or third abstraction - """ - pass - - def get_words(self): - token_list = self.messages.str.lower().str.split() - words = [] - for tokens in token_list: - # print(tokens) - if not isinstance(tokens, list): - print('WARNING! Not a list!') - continue # TODO ??? check this - for token in tokens: - words.append(token) - return words diff --git a/miner/ConversationStats.py b/miner/ConversationStats.py new file mode 100644 index 0000000..040bd17 --- /dev/null +++ b/miner/ConversationStats.py @@ -0,0 +1,89 @@ + +class ConversationStats: + """ + Statistics of conversation with one person. + """ + + def __init__(self, df): + self.df = df + + def __repr__(self): + return f'{self.msg_count}' + + @property + def messages(self): + return self.df.content.dropna() + + @property + def words(self): + return self.get_words() + + # 1. + @property + def msg_count(self): + return len(self.df) + + # 2. + @property + def unique_msg_count(self): + return len(self.messages.unique()) + + # 3. + @property + def most_used_msgs(self): + return self.messages.value_counts() + + # 4. + @property + def msg_frequency(self): + # NOTE this has been most likely depracated OR? + pass + + # 5. + @property + def word_count(self): + return len(self.words) + + # 6. + @property + def unique_word_count(self): + return len(set(self.words)) + + # 7. + @property + def most_used_words(self): + return pd.Series(self.words).value_counts() + + # 8. + @property + def word_frequency(self): + pass + + # 9. + @property + def char_count(self): + char_count = 0 + for word in self.words: + char_count += len(word) + return char_count + + # 10. 
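The count properties above all derive from the `content` column of the messages DataFrame. A standalone sketch of that derivation, using an invented three-row frame (the `None` row mimics a media-only message) rather than the project's test data:

```python
import pandas as pd

df = pd.DataFrame({'content': ['Hello there', 'ok', None]})

messages = df.content.dropna()                      # media-only rows are skipped
tokens_per_message = messages.str.lower().str.split()
words = [token for tokens in tokens_per_message for token in tokens]

word_count = len(words)                             # 3 -> 'hello', 'there', 'ok'
char_count = sum(len(word) for word in words)       # 12
print(word_count, char_count)
```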
+ @property + def rate_of_media_messages(self): + """ + TODO LATER + search for media messages all 5 of them + rate is only the second or third abstraction + """ + pass + + def get_words(self): + token_list = self.messages.str.lower().str.split() + words = [] + for tokens in token_list: + if not isinstance(tokens, list): + print('WARNING! Not a list!') + continue + for token in tokens: + words.append(token) + return words diff --git a/miner/Conversations.py b/miner/Conversations.py index 4f61ad9..7a373d4 100644 --- a/miner/Conversations.py +++ b/miner/Conversations.py @@ -1,64 +1,93 @@ +import pandas as pd import os -from miner.Group import Group + + +from miner.Messages import Messages from miner.Individual import Individual -from miner.FacebookData import FacebookData -import pandas as pd + from miner import utils -from datetime import datetime class Conversations: def __init__(self, data_path): - self.indie_convo_paths = [] - self.group_convo_paths = [] - self.deleted_user_convo_paths = [] + self.private_convo_paths = {} + self.group_convo_paths = {} # TODO fill this as well + self.deleted_user_convo_paths = [] # NOTE these are collected but not yet used self.data_path = f'{data_path}/{utils.MESSAGE_SUBPATH}' self.order_paths() def order_paths(self): - json_paths = utils.walk_directory_and_search(self.data_path, '.json') + paths_map = f'{self.data_path}/private_messages.json' + if os.path.isfile(paths_map): + self.read_paths(paths_map) + return + json_paths = utils.walk_directory_and_search(self.data_path, '.json', contains_string='message_') self.differentiate_paths(json_paths) + self.register_paths() def differentiate_paths(self, jsons): for file in jsons: msg = Messages(file) if msg.title == 'Facebook User': - self.deleted_user_convo_paths.append(file) + self.deleted_user_convo_paths.append(msg.messages_dir) elif msg.ttype == 'RegularGroup': - self.group_convo_paths.append(file) + self.map_group_convo_files(msg, file) elif msg.ttype == 'Regular': - self.indie_convo_paths.append(file) + # self.private_convo_paths[msg.title] = msg.messages_dir + self.map_private_convo_files(msg, file) else: raise ValueError('Should not happen!') + def register_paths(self): + utils.dump_to_json(self.private_convo_paths, f'{self.data_path}/private_messages.json') + + def read_paths(self, file): + self.private_convo_paths = utils.read_json(file) + print() + + def map_private_convo_files(self, msg, file): + name = msg.title + if self.private_convo_paths.get(name): + self.private_convo_paths[name].append(file) + else: + self.private_convo_paths[name] = [file] + + def map_group_convo_files(self, msg, file): + for participant in msg.participants: + if self.group_convo_paths.get(file): + self.group_convo_paths[file].append(participant) + else: + self.group_convo_paths[file] = [participant] + def get_people_from_private_messages(self, name=None, membership=True): name_data_map = {} - paths = self.indie_convo_paths - if name is not None: - paths = self.filter_by_name(name) - for file in paths: - messages = Messages(file) - name = messages.title - if name_data_map.get(name) is not None: - dfs = [name_data_map[name].messages, messages.df] - name_data_map[name].messages = pd.concat(dfs).sort_index() - else: - # TODO we may also want to get group messages where name is present - name_data_map[name] = self.create_individual(messages, membership=membership) + convo_path_map = self.filter_by_name(name) if name is not None else self.private_convo_paths.values() + for paths in convo_path_map: + for file in paths: + 
messages = Messages(file) + name = messages.title + if name_data_map.get(name) is not None: + dfs = [name_data_map[name].messages, messages.df] + name_data_map[name].messages = pd.concat(dfs).sort_index() + else: + name_data_map[name] = self.create_individual(messages, membership=membership) return name_data_map def filter_by_name(self, name): filtered_paths = [] - compact_name = None if name is None else utils.replace_accents(name.lower()) - for path in self.indie_convo_paths: - if compact_name in os.path.basename(os.path.dirname(os.path.normpath(path))): - filtered_paths.append(path) + names = [] + if isinstance(name, str): + names = [name] + elif isinstance(name, list): + names = name + for name in names: + filtered_paths.append(self.private_convo_paths.get(name)) return filtered_paths def create_individual(self, messages, membership=None): return Individual( - name=messages.title, title=messages.title, # TODO depracate one of (name, title) + name=messages.title, compact=messages.compact_names, messages=messages.df, messages_dir=messages.messages_dir, @@ -66,74 +95,12 @@ def create_individual(self, messages, membership=None): member_of=self.group_membership(messages.title) if membership else None, ) - @staticmethod - def fill_data_map(message): - return { - 'title': message.title, - 'compact_name': message.compact_names, - # 'participants': msg.participants + ['Levente Csőke'], - 'participants': message.participants, - 'messages': message.df, - 'friend': None, - 'messages_dir': message.messages_dir, - 'media_dir': message.media_dir - } - @staticmethod def group_membership(name): return None - @staticmethod - def json_is_a_group_msg(file): - msg = Messages(file) - return msg.ttype == 'RegularGroup' - - -class Messages(FacebookData): - def __init__(self, json_path): - super().__init__(json_path) - self.to_df('messages') - self.set_date_as_index() - - @property - def names(self): - return pd.DataFrame(self.participants)[0] - - @property - def participants(self): - participants = self.decoded.get('participants') - # TODO I should be IN - # but this breaks stuff at TestMessagingAnalyzer - return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] - # return [p.get('name') for p in participants if p.get('name')] - - @property - def title(self): - return self.decoded.get('title') - - @property - def ttype(self): - return self.decoded.get('thread_type') - - @property - def messages_dir(self): - thread_path = self.decoded.get('thread_path') - if not thread_path.startswith('inbox/'): - raise ValueError('Field `thread_path` should start with `inbox/`.') - return thread_path.split('inbox/')[1] - - @property - def media_dir(self): - for media in utils.MEDIA_DIRS: - if media in self._df.columns: - media_in_msg = list(self._df[media][self._df[media].notnull()]) - uri = media_in_msg[0][0].get('uri') - return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] - - def set_date_as_index(self): - date_series = self._df.timestamp_ms.apply(self.ts_to_date) - self._df = self._df.set_index(date_series).iloc[::-1] + def get_people_from_group_messages(self): + pass # TODO for v0.0.4 + + - @staticmethod - def ts_to_date(date): - return datetime.fromtimestamp(date / 1000) # .strftime('%Y-%m-%d') diff --git a/miner/FacebookData.py b/miner/FacebookData.py index babe74d..81b946f 100644 --- a/miner/FacebookData.py +++ b/miner/FacebookData.py @@ -21,8 +21,6 @@ def json(self): @property def compact_names(self): - # NOTE this is the place where we change pd/np to builtin - # do we have to do 
this? name_list = list(utils.without_accent_and_whitespace(utils.lower_names(self.names))) # should be just fine return name_list[0] if len(name_list) == 1 else name_list diff --git a/miner/Friends.py b/miner/Friends.py index 7950155..98d995f 100644 --- a/miner/Friends.py +++ b/miner/Friends.py @@ -1,5 +1,3 @@ -import pandas as pd -import os from miner.FacebookData import FacebookData from miner.Individual import Individual @@ -16,7 +14,7 @@ def get_people(self, name=None): if name is not None and name != full_name: # filtering for name continue names[full_name] = Individual( - name=full_name, title=full_name, # TODO depracate one of (name, title) + name=full_name, compact=compact, friend=True, ) diff --git a/miner/Individual.py b/miner/Individual.py index 84d63b5..4518a5f 100644 --- a/miner/Individual.py +++ b/miner/Individual.py @@ -1,9 +1,8 @@ class Individual: - def __init__(self, name=None, title=None, compact=None, messages=None, friend=None, messages_dir=None, + def __init__(self, name=None, compact=None, messages=None, friend=None, messages_dir=None, media_dir=None, member_of=None): self._name = name - self._title = title self._compact_name = compact self._messages = messages self._friend = friend @@ -16,8 +15,7 @@ def __repr__(self): def __add__(self, other): return Individual( - name=self.title if self.title else other.title, - title=self.title if self.title else other.title, # TODO depracate one of (name, title) + name=self.name if self.name else other.name, friend=self.friend if self.friend else other.friend, compact=self.compact_name if self.compact_name else other.compact_name, messages=self.messages if len(self.messages) else other.messages, @@ -30,10 +28,6 @@ def __add__(self, other): def name(self): return self._name - @property - def title(self): - return self._title - @property def messages(self): return self._messages diff --git a/miner/Me.py b/miner/Me.py index 377c3a9..b10356d 100644 --- a/miner/Me.py +++ b/miner/Me.py @@ -7,4 +7,4 @@ def __init__(self, *args): @property def name(self): - return 'Levente Csőke' + return '' diff --git a/miner/Messages.py b/miner/Messages.py new file mode 100644 index 0000000..6fbc9d3 --- /dev/null +++ b/miner/Messages.py @@ -0,0 +1,64 @@ +from datetime import datetime +import pandas as pd +import os + +from miner.FacebookData import FacebookData +from miner import utils + + +class Messages(FacebookData): + def __init__(self, json_path): + super().__init__(json_path) + self.to_df('messages') + self.set_date_as_index() + self.add_partner_column() + + @property + def names(self): + # TODO ugly + try: + return pd.DataFrame(self.participants)[0] + except KeyError: + return pd.Series({0: 'Facebook User'}) + + @property + def participants(self): + participants = self.decoded.get('participants') + # TODO I should be IN + # but this breaks stuff at TestMessagingAnalyzer + return [p.get('name') for p in participants if p.get('name') != 'Levente Csőke'] + # return [p.get('name') for p in participants if p.get('name')] + + @property + def title(self): + return self.decoded.get('title') + + @property + def ttype(self): + return self.decoded.get('thread_type') + + @property + def messages_dir(self): + thread_path = self.decoded.get('thread_path') + if not thread_path.startswith('inbox/'): + raise ValueError('Field `thread_path` should start with `inbox/`.') + return thread_path.split('inbox/')[1] + + @property + def media_dir(self): + for media in utils.MEDIA_DIRS: + if media in self._df.columns: + media_in_msg = 
list(self._df[media][self._df[media].notnull()]) + uri = media_in_msg[0][0].get('uri') + return os.path.dirname(os.path.dirname(uri)).split('inbox/')[1] + + def set_date_as_index(self): + date_series = self._df.timestamp_ms.apply(self.ts_to_date) + self._df = self._df.set_index(date_series).iloc[::-1] + + def add_partner_column(self): + self._df['partner'] = self.title + + @staticmethod + def ts_to_date(date): + return datetime.fromtimestamp(date / 1000) diff --git a/miner/MessagingAnalyzer.py b/miner/MessagingAnalyzer.py deleted file mode 100644 index 8e8ddd4..0000000 --- a/miner/MessagingAnalyzer.py +++ /dev/null @@ -1,127 +0,0 @@ -from miner.utils import year_converter, month_converter, generate_date_series, get_stats_for_intervals -from datetime import datetime, date, timedelta -import pandas as pd -from miner.ConversationAnalyzer import ConversationAnalyzer - - -class MessagingAnalyzer: - def __init__(self, people): - self.names = people.names - self.people = people.data - - def get_stats(self, df, subject='all', start=None, end=None, period=None): - # TODO LATER - # here you have to do something with it - pass - - def get_count(self, attribute, subject='all', start=None, end=None, period=None): - count = 0 - # we have a list of names we want to iterate over - for name in self.names: - stats = self.get_conversation_stats(name=name, subject=subject, start=start, end=end, period=period) - if stats is not None: - count += getattr(stats, attribute) - return count - - def get_conversation_stats(self, name, subject='all', start=None, end=None, period=None): - messages = self.people.get(name).messages - analyzer = ConversationAnalyzer(name, messages) - if analyzer is None: - return None - return analyzer.get_stats(messages, subject=subject, start=start, end=end, period=period) - - def total_number_of_(self, attribute, subject='all', **kwargs): - return self.get_count(attribute=attribute, subject=subject, **kwargs) - - # 1. Ranking of friends by total count of messages/words/characters (also by year/month/day/hour) - def total_number_of_messages(self, **kwargs): - return self.total_number_of_(attribute='msg_count', **kwargs) - - def total_number_of_words(self, **kwargs): - return self.total_number_of_(attribute='word_count', **kwargs) - - def total_number_of_characters(self, **kwargs): - return self.total_number_of_(attribute='char_count', **kwargs) - - # 2. Ranking of friends who I sent the most messages/words/characters (also by year/month/day/hour) - def total_number_of_messages_sent(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='me', **kwargs) - - def total_number_of_words_sent(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='me', **kwargs) - - def total_number_of_characters_sent(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='me', **kwargs) - - # 3. Ranking of friends who sent the most messages/words/characters (also by year/month) - def total_number_of_messages_received(self, **kwargs): - return self.total_number_of_(attribute='msg_count', subject='partner', **kwargs) - - def total_number_of_words_received(self, **kwargs): - return self.total_number_of_(attribute='word_count', subject='partner', **kwargs) - - def total_number_of_characters_received(self, **kwargs): - return self.total_number_of_(attribute='char_count', subject='partner', **kwargs) - - # 4. 
Most used messages/words in convos by me/partner (also by year/month/day/hour) - def most_used_messages_by_me(self, **kwargs): - """ - >>> s1 = pd.Series([3, 1, 2, 3, 4, 1, 1]) - >>> s2 = pd.Series([3, 2, 1, 1]) - >>> s1_vc = s1.value_counts() - >>> s2_vc = s2.value_counts() - TODO LATER most used is already a problem: - - because its a series of all the unique messages/words ever used in a convo - - it contains strings like ':d', ':p' and 'xd' - - from all the convos the result of value_counts has to be cleared - and has to be truncated (that is not use the 200th most used word, only top10 let's say) - - then these series has to be merged in a way that the same string's counts are added up - - what about typos????! - """ - pass - - def most_used_messages_by_partners(self, **kwargs): - pass - - def most_used_words_by_me(self, **kwargs): - pass - - def most_used_words_by_partners(self, **kwargs): - pass - - # 5. Number of messages sent/got on busiest period (by year/month/day/hour) - def days_when_most_messages_sent(self): - pass - - def days_when_most_messages_received(self): - pass - - def hours_when_most_messages_sent(self): - # TODO LATER hard algorithmic problem - # is this referring to the absolute hour most messages sent?? - # like: 2014.07.25. 15h-16h - # OR - # the pattern of most messages sent between this and this hours - # like: 20h-21h - # ACTUALLY BOTH - # for years/months/days/hours - # BUT this comes from the time series analysis - pass - - def hours_when_most_messages_received(self): - pass - - # 6. Time series: dict of 'year/month/day/hour : number of messages/words/characters (also sent/got) for user/all convos' - def time_series_analysis_for_all(self, subject=None, **kwargs): - time_series = generate_date_series(**kwargs) - stacked_df = self.stack_dfs(self.people) - interval_stats = get_stats_for_intervals(self.get_stats, stacked_df, time_series, subject=subject) - # TODO finsh this for time series for all - - @staticmethod - def stack_dfs(people): - dfs = [] - for data in people.values(): - if data.messages is not None: - dfs.append(data.messages) - return pd.concat(dfs).sort_index() diff --git a/miner/People.py b/miner/People.py index 707c6f5..2970390 100644 --- a/miner/People.py +++ b/miner/People.py @@ -3,8 +3,6 @@ from miner.Conversations import Conversations from miner.Friends import Friends -# from Me import Me - DATA_PATH = '/home/levente/projects/facebook-data-miner/data' @@ -21,7 +19,7 @@ def data(self): @property def names(self): - return self._names + return self._names #if len(self._names) > 1 else self._names[0] @property def groups(self): @@ -36,9 +34,10 @@ def get_people(self, name=None): # TODO LATER too slow -> store in file start = time.time() conversations = Conversations(self.data_path) - individuals = conversations.get_people_from_private_messages() - - print('convos: ', time.time() - start) + print('convos1: ', time.time() - start) + start = time.time() + individuals = conversations.get_people_from_private_messages(name=name) + print('convos2: ', time.time() - start) return self.unify_people(friends, individuals) diff --git a/miner/requirements.txt b/miner/requirements.txt new file mode 100644 index 0000000..1262ec9 --- /dev/null +++ b/miner/requirements.txt @@ -0,0 +1,9 @@ +numpy==1.18.1 +pandas==1.0.3 +dateparser==0.7.6 +seaborn==0.10.1 +matplotlib==3.2.1 +plotly==4.8.2 +miner==0.0.0 +Pillow==7.2.0 +python_dateutil==2.8.1 diff --git a/miner/utils.py b/miner/utils.py index c80001b..cfa7644 100644 --- a/miner/utils.py +++ b/miner/utils.py @@ -1,6 
+1,5 @@ import os import json -import pandas as pd import dateparser from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta @@ -9,6 +8,13 @@ MEDIA_DIRS = ['photos', 'gifs', 'files', 'videos', 'audio'] MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'] +WEEKDAYS = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] +PERIOD_MAP = { + 'y': None, + 'm': MONTHS, + 'd': WEEKDAYS, + 'h': None, +} DELTA_MAP = { 'y': relativedelta(years=+1), 'm': relativedelta(months=+1), @@ -34,17 +40,14 @@ def read_json(file): def dump_to_json(data=None, file=None): - with open(file, 'w') as f: - json.dump(data, f) + with open(file, 'w', encoding='utf8') as f: + json.dump(data, f, ensure_ascii=False) def order_list_of_dicts(lst, key='timestamp_ms'): return sorted(lst, key=lambda k: k[key]) -# - - def year_converter(func): """ Higher-order function that converts @year param passed to @func into numeric version. @@ -121,31 +124,22 @@ def wrapper(*args, **kwargs): return wrapper -def generate_date_series(start=None, end=None, period=None): +def generate_date_series(period, start=None, end=None): if period is None or DELTA_MAP.get(period) is None: raise ValueError('Parameter `period` should be one of {y, m, d, h}') start = start or datetime(year=2009, month=10, day=2, hour=0) # TODO LATER change this to date when user joined FB end = end or datetime.now() + # TODO THIS HAS A PROBLEM. msgs happened in 2020 getting assigned to 2019 because: 2019 + 1 year + start.month + start.day < now() + # TODO serious problem! dates = [] intermediate = start - while intermediate <= end: + while intermediate <= (end + DELTA_MAP.get(period)): # means that we want to have the end in it as well dates.append(intermediate) intermediate = intermediate + DELTA_MAP.get(period) return dates -def get_stats_for_intervals(func, df, time_series, subject='all'): - data = {} - for i in range(len(time_series) - 1): # only looping len - 1 times - start = time_series[i] - # TODO test it with new data injected/modified at runtime <- this is hard - # what is this about actually? 
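The TODO in `generate_date_series` above — messages from 2020 ending up keyed under 2019 — follows from the buckets being anchored to the join date (October 2) rather than to calendar-year boundaries. A small illustration with invented dates, not project code:

```python
from datetime import datetime
from dateutil.relativedelta import relativedelta

start = datetime(2009, 10, 2)                                  # assumed join date
dates = [start + relativedelta(years=+i) for i in range(12)]   # 2009-10-02 ... 2020-10-02

msg_time = datetime(2020, 3, 1)
bucket = max(d for d in dates if d <= msg_time)                # interval the message falls into
print(bucket)  # 2019-10-02 00:00:00 -- a March 2020 message is keyed under a 2019 start
```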
- end = time_series[i + 1] - data[start] = func(df, subject=subject, start=start, end=end) - return data - - def dt(year: int = 2004, month: int = 1, day: int = 1, hour: int = 0): return datetime(year=year, month=month, day=day, hour=hour) @@ -191,10 +185,42 @@ def without_accent_and_whitespace(col): return col.apply(replace_accents) -def walk_directory_and_search(path, extension): +def walk_directory_and_search(path, extension, contains_string=None): paths = [] for root, dirs, files in os.walk(path): for file_name in files: if file_name.endswith(extension): - paths.append(os.path.join(root, file_name)) + if contains_string is not None and contains_string in file_name: + paths.append(os.path.join(root, file_name)) return paths + + +def fill_dict(dictionary, key, value): + if dictionary.get(key) is not None: + dictionary[key] += value + else: + dictionary[key] = value + return dictionary + + +def month_sorter(x): + return MONTHS.index(x[0]) + + +def count_stat_for_period(data, period): + # TODO sort by lists + periods = {} + for key, value in data.items(): + if period == 'y': + periods = fill_dict(periods, key.year, value) + periods = dict(sorted(periods.items())) + elif period == 'm': + periods = fill_dict(periods, MONTHS[key.month - 1], value) + periods = dict(sorted(periods.items(), key=lambda x: MONTHS.index(x[0]))) + elif period == 'd': + periods = fill_dict(periods, WEEKDAYS[key.weekday()], value) + periods = dict(sorted(periods.items(), key=lambda x: WEEKDAYS.index(x[0]))) + elif period == 'h': + periods = fill_dict(periods, key.hour, value) + periods = dict(sorted(periods.items())) + return periods diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..76f8eef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +seaborn==0.10.1 +dateparser==0.7.6 +pandas==1.0.3 +matplotlib==3.2.1 +python_dateutil==2.8.1 diff --git a/tests/test_ConversationAnalyzer.py b/tests/test_ConversationAnalyzer.py index f7d679f..9d11e46 100644 --- a/tests/test_ConversationAnalyzer.py +++ b/tests/test_ConversationAnalyzer.py @@ -1,36 +1,35 @@ import pytest -from miner.ConversationAnalyzer import ConversationAnalyzer -from miner.People import People + +from miner.Analyzer import Analyzer from miner.utils import dt TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' -@pytest.fixture(scope='session') -def person(get_people): - def _person(name): - people = get_people(name) - return people.data[name] - - return _person +# @pytest.fixture(scope='session') +# def person(get_people): +# def _person(name): +# people = get_people(name) +# return people.data[name] +# +# return _person @pytest.fixture(scope='session') -def analyze(person): +def analyze(get_people): def _analyze(name): - individual = person(name) - return ConversationAnalyzer(name, individual.messages) + people = get_people(name) + return Analyzer(people) return _analyze @pytest.fixture(scope='session') -def statistics(person, analyze): +def statistics(analyze): def _stats(name, **kwargs): - individual = person(name) analyzer = analyze(name) if 'subject' in kwargs or 'start' in kwargs or 'end' in kwargs: # and others - return analyzer.get_stats(individual.messages, **kwargs) + return analyzer.get_stats(**kwargs) else: return analyzer.stats @@ -219,10 +218,6 @@ def test_stats_teflon_musk_all_2014_12(statistics): # assert stats.most_used_chars == 0 -class TestConversationAnalyzer: # Foo Bar - pass - - def test_time_series_analysis_for_user(analyze): analyzer = analyze('Teflon Musk') 
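`utils.count_stat_for_period`, added above, buckets interval counts into years/months/weekdays/hours via `fill_dict`. A toy run of the weekday case, with a simplified `fill_dict` and made-up counts rather than project output:

```python
from datetime import datetime

WEEKDAYS = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

def fill_dict(dictionary, key, value):
    # simplified: add up values landing on the same key
    dictionary[key] = dictionary.get(key, 0) + value
    return dictionary

data = {datetime(2020, 2, 13): 4, datetime(2020, 2, 20): 2, datetime(2020, 2, 14): 1}
periods = {}
for key, value in data.items():
    periods = fill_dict(periods, WEEKDAYS[key.weekday()], value)
periods = dict(sorted(periods.items(), key=lambda x: WEEKDAYS.index(x[0])))
print(periods)  # {'thursday': 6, 'friday': 1}
```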
analyzer.get_time_series_data(subject='all', period='y') diff --git a/tests/test_Conversations.py b/tests/test_Conversations.py index d066361..ef9fdc3 100644 --- a/tests/test_Conversations.py +++ b/tests/test_Conversations.py @@ -1,72 +1,76 @@ import pandas as pd import pytest from miner.Conversations import Conversations +from miner.Individual import Individual from miner import utils import os + TEST_DATA_PATH = '/home/levente/projects/facebook-data-miner/tests/test_data' @pytest.fixture() -def convos(): - convo = Conversations(f'{TEST_DATA_PATH}') - return convo.get_people_from_private_messages() +def conversations(): + return Conversations(f'{TEST_DATA_PATH}') + + +@pytest.fixture +def people_from_private_convos(conversations): + return conversations.get_people_from_private_messages() + + +def test_if_paths_are_registered(conversations): + assert len(conversations.private_convo_paths) == 4 + assert len(conversations.group_convo_paths) == 3 + assert len(conversations.deleted_user_convo_paths) == 0 -def test_get_all_people_from_convo(convos): +def test_get_all_people_from_private_messages(people_from_private_convos): + people = list(people_from_private_convos.keys()) + expected = ['Foo Bar', 'Teflon Musk', 'Benedek Elek', 'Tőke Hal'] + assert sorted(people) == sorted(expected) + + +def test_get_all_people_from_convo(conversations): people = [] - # TODO make this work - for convo in convos.keys(): - if convo.startswith('group'): - people += [p for p in convos[convo].get('participants')] - else: - people.append(convo) - people = list(set(people)) + # indie + people += list(conversations.private_convo_paths.keys()) + # group + people_from_groups = [p for people in conversations.group_convo_paths.values() for p in people] + + people += people_from_groups expected = ['Dér Dénes', 'Facebook User', 'Foo Bar', 'John Doe', 'Teflon Musk', 'Benedek Elek', 'Donald Duck', 'Tőke Hal'] - # TODO LATER what to do with Facebook User?????? 
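The path counts asserted in these tests come from `Conversations.map_private_convo_files`, which groups every `message_*.json` file under its conversation title. A toy illustration of that grouping with invented file names:

```python
private_convo_paths = {}

def map_private_convo_files(title, file):
    # same grouping idea as in Conversations; `title` stands in for msg.title
    if private_convo_paths.get(title):
        private_convo_paths[title].append(file)
    else:
        private_convo_paths[title] = [file]

map_private_convo_files('Teflon Musk', 'inbox/teflonmusk_x/message_1.json')
map_private_convo_files('Teflon Musk', 'inbox/teflonmusk_x/message_2.json')
print(private_convo_paths)
# {'Teflon Musk': ['inbox/teflonmusk_x/message_1.json', 'inbox/teflonmusk_x/message_2.json']}
```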
- assert sorted(people) == sorted(expected) + assert sorted(list(set(people))) == sorted(expected) + + +def test_people_are_individual_instances(people_from_private_convos): + assert all([isinstance(person, Individual) for person in people_from_private_convos.values()]) + + +def test_all_individual_have_messages_df(people_from_private_convos): + assert all([isinstance(data.messages, pd.DataFrame) for data in people_from_private_convos.values()]) -def test_all_convos_have_dir(convos): - assert all([data.messages_dir for data in convos.values()]) +def test_all_individual_have_dir(people_from_private_convos): + assert all([data.messages_dir for data in people_from_private_convos.values()]) -def test_all_convos_have_messages_df(convos): - assert all([isinstance(data.messages, pd.DataFrame) for data in convos.values()]) +def test_some_individual_as_media_dir(people_from_private_convos): + assert people_from_private_convos.get('Teflon Musk').media_dir + assert not people_from_private_convos.get('Benedek Elek').media_dir -def test_some_convos_as_media_dir(convos): - assert convos.get('Teflon Musk').media_dir - assert not convos.get('Benedek Elek').media_dir -def test_convo_media_has_one_folder_of_possibles(convos): - listed_dir = os.listdir(f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{convos.get('Teflon Musk').media_dir}") +def test_individual_media_has_one_folder_of_possibles(people_from_private_convos): + listed_dir = os.listdir( + f"{TEST_DATA_PATH}/{utils.MESSAGE_SUBPATH}/{people_from_private_convos.get('Teflon Musk').media_dir}") assert 'files' in listed_dir assert 'photos' in listed_dir assert 'audio' not in listed_dir -def test_groups_have_more_than_two_participates(convos): - groups = {convo: data for convo, data in convos.items() if convo.startswith('group')} + +def test_groups_have_more_than_two_participates(people_from_private_convos): + groups = {convo: data for convo, data in people_from_private_convos.items() if convo.startswith('group')} # TODO participants should contain the user itself as well assert all([len(data.get('participants')) > 2 for data in groups.values()]) - - - -""" -testcases: -- individual convos contain all names, compact_names, message folders and media folders - - media folders are a big question. how do you get it? actually once you have the thread_path then from that you can guess, - OR better off use the uri in the messages... 
fuck seems complicated -- friends contain all names and compact names, -- convos and friends has a common set, and the set is identical -- people gets assigned with all the unique friends and individual/group convos - -gonna test: -- assigning messages to friends, -- deal with multiple directories, IF there are multiple directories, -- -concerns: -- what to do with non-friends, -- I assume multiple directories are because of files sent, -""" diff --git a/tests/test_MessagingAnalyzer.py b/tests/test_MessagingAnalyzer.py index ec03497..969dc5d 100644 --- a/tests/test_MessagingAnalyzer.py +++ b/tests/test_MessagingAnalyzer.py @@ -1,244 +1,245 @@ import pytest -from miner.MessagingAnalyzer import MessagingAnalyzer + +from miner.Analyzer import Analyzer from miner.utils import dt @pytest.fixture(scope='session') def analyzer(get_people): people = get_people() - return MessagingAnalyzer(people) + return Analyzer(people) def test_total_number_of_messages(analyzer): - assert analyzer.total_number_of_messages() == 29 + assert analyzer.get_count(attribute='msg_count', ) == 29 - assert analyzer.total_number_of_messages(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014), period='y') == 11 - assert analyzer.total_number_of_messages(start=dt(year=2018), period='y') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2020), period='y') == 15 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014), period='y') == 11 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018), period='y') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020), period='y') == 15 - assert analyzer.total_number_of_messages(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=11), period='m') == 8 - assert analyzer.total_number_of_messages(start=dt(year=2014, month=12), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=11), period='m') == 8 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2014, month=12), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=1), period='m') == 3 - assert analyzer.total_number_of_messages(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=1), period='m') == 3 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2018, month=5), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2), period='m') == 10 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=3), period='m') == 1 # jpg - assert analyzer.total_number_of_messages(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=8), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2), period='m') == 10 + 
assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=3), period='m') == 1 # jpg + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=8), period='m') == 1 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13), period='d') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13), period='d') == 2 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 2 - assert analyzer.total_number_of_messages(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 + assert analyzer.get_count(attribute='msg_count', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 4 def test_total_number_of_words(analyzer): - assert analyzer.total_number_of_words() == 86 + assert analyzer.get_count(attribute='word_count', ) == 86 - assert analyzer.total_number_of_words(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words(start=dt(year=2014), period='y') == 20 - assert analyzer.total_number_of_words(start=dt(year=2018), period='y') == 32 - assert analyzer.total_number_of_words(start=dt(year=2020), period='y') == 34 + assert analyzer.get_count(attribute='word_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014), period='y') == 20 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018), period='y') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020), period='y') == 34 - assert analyzer.total_number_of_words(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words(start=dt(year=2014, month=11), period='m') == 13 - assert analyzer.total_number_of_words(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=11), period='m') == 13 + assert analyzer.get_count(attribute='word_count', start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2018, month=1), period='m') == 32 - assert analyzer.total_number_of_words(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=1), period='m') == 32 + assert analyzer.get_count(attribute='word_count', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2), period='m') == 27 - assert analyzer.total_number_of_words(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words(start=dt(year=2020, month=5), period='m') == 1 - assert analyzer.total_number_of_words(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_words(start=dt(year=2020, month=8), period='m') == 2 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2), period='m') == 27 + assert 
analyzer.get_count(attribute='word_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=4), period='m') == 4 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=5), period='m') == 1 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=8), period='m') == 2 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13), period='d') == 14 - assert analyzer.total_number_of_words(start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13), period='d') == 14 + assert analyzer.get_count(attribute='word_count', start=dt(year=2020, month=2, day=13, hour=5), period='d') == 14 def test_total_number_of_characters(analyzer): - assert analyzer.total_number_of_characters() == 379 + assert analyzer.get_count(attribute='char_count', ) == 379 - assert analyzer.total_number_of_characters(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2014), period='y') == 69 - assert analyzer.total_number_of_characters(start=dt(year=2018), period='y') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2020), period='y') == 140 + assert analyzer.get_count(attribute='char_count', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014), period='y') == 69 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018), period='y') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020), period='y') == 140 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=9), period='m') == 24 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=11), period='m') == 42 - assert analyzer.total_number_of_characters(start=dt(year=2014, month=12), period='m') == 3 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=9), period='m') == 24 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=11), period='m') == 42 + assert analyzer.get_count(attribute='char_count', start=dt(year=2014, month=12), period='m') == 3 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=1), period='m') == 170 - assert analyzer.total_number_of_characters(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=1), period='m') == 170 + assert analyzer.get_count(attribute='char_count', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=2), period='m') == 114 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=4), period='m') == 17 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=5), period='m') == 4 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_characters(start=dt(year=2020, month=8), period='m') == 5 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=2), period='m') == 114 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='char_count', 
start=dt(year=2020, month=4), period='m') == 17 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=5), period='m') == 4 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='char_count', start=dt(year=2020, month=8), period='m') == 5 def test_total_number_of_messages_sent(analyzer): - assert analyzer.total_number_of_messages_sent() == 17 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014), period='y') == 6 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018), period='y') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020), period='y') == 9 + assert analyzer.get_count(attribute='msg_count', subject='me', ) == 17 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014), period='y') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018), period='y') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020), period='y') == 9 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=9), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=11), period='m') == 4 - assert analyzer.total_number_of_messages_sent(start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=1), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=9), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=11), period='m') == 4 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=1), period='m') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2011, month=11), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2018, month=5), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2011, month=11), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2018, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2), period='m') == 6 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=4), period='m') == 2 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=8), period='m') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2), period='m') == 6 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=3), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=4), period='m') == 2 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=5), period='m') == 0 + assert 
analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=6), period='m') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=8), period='m') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13), period='d') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 - assert analyzer.total_number_of_messages_sent(start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 1 + assert analyzer.get_count(attribute='msg_count', subject='me', start=dt(year=2020, month=2, day=13, hour=18), period='h') == 0 def test_total_number_of_words_sent(analyzer): - assert analyzer.total_number_of_words_sent() == 69 + assert analyzer.get_count(attribute='word_count', subject='me', ) == 69 - assert analyzer.total_number_of_words_sent(start=dt(year=2000), period='y') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2014), period='y') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2018), period='y') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2020), period='y') == 22 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2000), period='y') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014), period='y') == 16 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018), period='y') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020), period='y') == 22 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=9), period='m') == 6 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=11), period='m') == 9 - assert analyzer.total_number_of_words_sent(start=dt(year=2014, month=12), period='m') == 1 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=9), period='m') == 6 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=11), period='m') == 9 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2014, month=12), period='m') == 1 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=1), period='m') == 31 - assert analyzer.total_number_of_words_sent(start=dt(year=2018, month=2), period='m') == 0 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=1), period='m') == 31 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2018, month=2), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2), period='m') == 16 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=3), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=4), period='m') == 4 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=5), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=6), period='m') == 0 - assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=8), period='m') == 2 + assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2), period='m') == 16 + assert 
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=3), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=4), period='m') == 4
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=5), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=6), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=8), period='m') == 2
 
-    assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13), period='d') == 5
-    assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5
-    assert analyzer.total_number_of_words_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13), period='d') == 5
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 5
+    assert analyzer.get_count(attribute='word_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0
 
 
 def test_total_number_of_characters_sent(analyzer):
-    assert analyzer.total_number_of_characters_sent() == 311
+    assert analyzer.get_count(attribute='char_count', subject='me', ) == 311
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2000), period='y') == 0
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2014), period='y') == 60
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2018), period='y') == 167
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020), period='y') == 84
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2000), period='y') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014), period='y') == 60
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018), period='y') == 167
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020), period='y') == 84
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=9), period='m') == 24
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=11), period='m') == 33
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2014, month=12), period='m') == 3
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=9), period='m') == 24
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=11), period='m') == 33
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2014, month=12), period='m') == 3
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=1), period='m') == 167
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2018, month=2), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=1), period='m') == 167
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2018, month=2), period='m') == 0
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2), period='m') == 62
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=3), period='m') == 0
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=4), period='m') == 17
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=5), period='m') == 0
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=6), period='m') == 0
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=8), period='m') == 5
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2), period='m') == 62
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=3), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=4), period='m') == 17
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=5), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=6), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=8), period='m') == 5
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='d') == 21
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='d') == 0
 
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21
-    assert analyzer.total_number_of_characters_sent(start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=6), period='h') == 21
+    assert analyzer.get_count(attribute='char_count', subject='me', start=dt(year=2020, month=2, day=13, hour=7), period='h') == 0
 
 
 def test_total_number_of_messages_received(analyzer):
-    assert analyzer.total_number_of_messages_received() == 12
-    assert analyzer.total_number_of_messages_received(start=dt(year=2000), period='y') == 0
-    assert analyzer.total_number_of_messages_received(start=dt(year=2014), period='y') == 5
-    assert analyzer.total_number_of_messages_received(start=dt(year=2018), period='y') == 1
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020), period='y') == 6
+    assert analyzer.get_count(attribute='msg_count', subject='partner', ) == 12
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2000), period='y') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014), period='y') == 5
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018), period='y') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020), period='y') == 6
 
-    assert analyzer.total_number_of_messages_received(start=dt(year=2011, month=11), period='m') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2011, month=11), period='m') == 0
 
-    assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=9), period='m') == 0
-    assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=11), period='m') == 4
-    assert analyzer.total_number_of_messages_received(start=dt(year=2014, month=12), period='m') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2014, month=12), period='m') == 1
 
-    assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=1), period='m') == 1
-    assert analyzer.total_number_of_messages_received(start=dt(year=2018, month=5), period='m') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2018, month=5), period='m') == 0
 
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2), period='m') == 4
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=3), period='m') == 1
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=4), period='m') == 0
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=5), period='m') == 1
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=8), period='m') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2), period='m') == 4
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=3), period='m') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=4), period='m') == 0
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=8), period='m') == 0
 
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=13), period='d') == 1
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=14), period='d') == 2
-    assert analyzer.total_number_of_messages_received(start=dt(year=2020, month=2, day=18), period='d') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 1
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2
+    assert analyzer.get_count(attribute='msg_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 1
 
 
 def test_total_number_of_words_received(analyzer):
-    assert analyzer.total_number_of_words_received() == 17
+    assert analyzer.get_count(attribute='word_count', subject='partner', ) == 17
 
-    assert analyzer.total_number_of_words_received(start=dt(year=2000), period='y') == 0
-    assert analyzer.total_number_of_words_received(start=dt(year=2014), period='y') == 4
-    assert analyzer.total_number_of_words_received(start=dt(year=2018), period='y') == 1
-    assert analyzer.total_number_of_words_received(start=dt(year=2020), period='y') == 12
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2000), period='y') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014), period='y') == 4
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018), period='y') == 1
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020), period='y') == 12
 
-    assert analyzer.total_number_of_words_received(start=dt(year=2014, month=9), period='m') == 0
-    assert analyzer.total_number_of_words_received(start=dt(year=2014, month=11), period='m') == 4
-    assert analyzer.total_number_of_words_received(start=dt(year=2014, month=12), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=11), period='m') == 4
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0
 
-    assert analyzer.total_number_of_words_received(start=dt(year=2018, month=1), period='m') == 1
-    assert analyzer.total_number_of_words_received(start=dt(year=2018, month=2), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=1), period='m') == 1
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0
 
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2), period='m') == 11
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=3), period='m') == 0
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=5), period='m') == 1
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2), period='m') == 11
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=5), period='m') == 1
 
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=13), period='d') == 9
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=14), period='d') == 2
-    assert analyzer.total_number_of_words_received(start=dt(year=2020, month=2, day=18), period='d') == 0
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 9
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 2
+    assert analyzer.get_count(attribute='word_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0
 
 
 def test_total_number_of_characters_received(analyzer):
-    assert analyzer.total_number_of_characters_received() == 68
+    assert analyzer.get_count(attribute='char_count', subject='partner', ) == 68
 
-    assert analyzer.total_number_of_characters_received(start=dt(year=2000), period='y') == 0
-    assert analyzer.total_number_of_characters_received(start=dt(year=2014), period='y') == 9
-    assert analyzer.total_number_of_characters_received(start=dt(year=2018), period='y') == 3
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020), period='y') == 56
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2000), period='y') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014), period='y') == 9
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018), period='y') == 3
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020), period='y') == 56
 
-    assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=9), period='m') == 0
-    assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=11), period='m') == 9
-    assert analyzer.total_number_of_characters_received(start=dt(year=2014, month=12), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=9), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=11), period='m') == 9
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2014, month=12), period='m') == 0
 
-    assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=1), period='m') == 3
-    assert analyzer.total_number_of_characters_received(start=dt(year=2018, month=2), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=1), period='m') == 3
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2018, month=2), period='m') == 0
 
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2), period='m') == 52
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=3), period='m') == 0
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=5), period='m') == 4
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2), period='m') == 52
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=3), period='m') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=5), period='m') == 4
 
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=13), period='d') == 30
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=14), period='d') == 22
-    assert analyzer.total_number_of_characters_received(start=dt(year=2020, month=2, day=18), period='d') == 0
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=13), period='d') == 30
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=14), period='d') == 22
+    assert analyzer.get_count(attribute='char_count', subject='partner', start=dt(year=2020, month=2, day=18), period='d') == 0
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 4d11263..3e57e17 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -65,19 +65,20 @@ def test_generate_date_series():
+    # TODO resolve
     start = datetime(2020, 1, 1, 0, 0)
     end = datetime(2021, 1, 1, 0, 0)
-    date_range_year = generate_date_series(start, end, 'y')
+    date_range_year = generate_date_series('y', start, end)
     assert len(date_range_year) == 1 + 1
-    date_range_month = generate_date_series(start, end, 'm')
+    date_range_month = generate_date_series('m', start, end)
     assert len(date_range_month) == 12 + 1
-    date_range_day = generate_date_series(start, end, 'd')
+    date_range_day = generate_date_series('d', start, end)
     assert len(date_range_day) == 366 + 1
-    date_range_hour = generate_date_series(start, end, 'h')
+    date_range_hour = generate_date_series('h', start, end)
     assert len(date_range_hour) == (366 * 24) + 1
 
     for day in date_range_day: