diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt new file mode 100644 index 0000000..2ffeac2 --- /dev/null +++ b/data_extractor/tests/data/_chat.txt @@ -0,0 +1,11 @@ +[16/03/2022, 15:20:25] user1: Hi shiva! +[16/03/2022, 15:25:38] user2: Hi 👋 +[16/03/2022, 15:26:48] user3: Hoi! +[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E +[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469 +[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants +[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁 +[26/03/2022, 18:52:15] user2: Well done Utrecht 😁 +Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4 +https://nos.nl/l/2421368#UPDATE-container-60145354 +[14/07/2020, 22:05:54] user4: 👍Bedankt \ No newline at end of file diff --git a/data_extractor/tests/data/account_info.zip b/data_extractor/tests/data/account_info.zip new file mode 100644 index 0000000..0489b54 Binary files /dev/null and b/data_extractor/tests/data/account_info.zip differ diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip new file mode 100644 index 0000000..b3f6edf Binary files /dev/null and b/data_extractor/tests/data/whatsapp_chat.zip differ diff --git a/data_extractor/tests/test_whatsapp.py b/data_extractor/tests/test_whatsapp.py deleted file mode 100644 index aa05945..0000000 --- a/data_extractor/tests/test_whatsapp.py +++ /dev/null @@ -1,22 +0,0 @@ -from example import process -from pathlib import Path - -import pandas as pd -from pandas.testing import assert_frame_equal - -DATA_PATH = Path(__file__).parent / "data" - -EXPECTED = [ - {'filename': 'hello/', 'compressed size': 0, 'size': 0}, - {'filename': 'hello/world.txt', 'compressed size': 267, 'size': 471} -] - - -def test_hello_world(): - df_expected = pd.DataFrame(EXPECTED) - - result = process(DATA_PATH.joinpath("hello.zip").open("rb")) - assert len(result) == 1 - assert result[0]["id"] == 'overview' - assert result[0]["title"] == 'The following files where read:' - assert_frame_equal(result[0]["data_frame"], df_expected) diff --git a/data_extractor/tests/test_whatsapp_account_info.py b/data_extractor/tests/test_whatsapp_account_info.py new file mode 100644 index 0000000..d5ecb2e --- /dev/null +++ b/data_extractor/tests/test_whatsapp_account_info.py @@ -0,0 +1,28 @@ +import pandas as pd +from pandas.testing import assert_frame_equal + +from whatsapp_account_info import process +from pathlib import Path + + +DATA_PATH = Path(__file__).parent / "data" +EXPECTED = [ + {'number_of_groups': 4, + 'number_of_contacts': 3 + } +] + + +def test_process(): + """ Test process function. + compares the expected dataframe with the output of the process function to check if all the columns are matched. + Raises + ------- + AssertionError: When provided expected dataframe could not match the participants dataframe + """ + df_expected = pd.DataFrame(EXPECTED) + + result = process(DATA_PATH.joinpath("account_info.zip")) + df_result = result[0]["data_frame"] + assert_frame_equal(df_result, df_expected) + diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py new file mode 100644 index 0000000..6d5a6a6 --- /dev/null +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -0,0 +1,46 @@ +from whatsapp_chat import process +from whatsapp_chat import anonymize_participants +from pathlib import Path +import pandas as pd +from pandas.testing import assert_frame_equal + + +DATA_PATH = Path(__file__).parent / "data" + +EXPECTED = [ + {'username': 'user1', 'total_words_no': 20, 'url_no': 1, 'location_no': 1, 'file_no': 0, 'message_no': 3, + 'out_degree': 2, 'in_degree': 3, 'user_reply2': 'user2', 'reply_2_user': 'user2'}, + + {'username': 'user2', 'total_words_no': 18, 'url_no': 2, 'location_no': 0, 'file_no': 0, 'message_no': 3, + 'out_degree': 3, 'in_degree': 3, 'user_reply2': 'user1', 'reply_2_user': 'user1'}, + + {'username': 'user3', 'total_words_no': 1, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 1, + 'out_degree': 1, 'in_degree': 1, 'user_reply2': 'user2', 'reply_2_user': 'user2'}, + + {'username': 'user4', 'total_words_no': 21, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 2, + 'out_degree': 2, 'in_degree': 1, 'user_reply2': 'user1', 'reply_2_user': 'user1'} +] + + +def test_process(): + """ Test process function. + compares the expected dataframe with the output of the process function to check if all the columns are match. + Raises + ------- + AssertionError: When provided expected dataframe could not match the participants dataframe + """ + + df_expected = pd.DataFrame(EXPECTED) + df_expected = anonymize_participants(df_expected) + df_expected['message_no'] = df_expected['message_no'].astype('int64') + df_expected['url_no'] = df_expected['url_no'].astype('int32') + df_expected['location_no'] = df_expected['location_no'].astype('int32') + df_expected['file_no'] = df_expected['file_no'].astype('int32') + + # result = process(DATA_PATH.joinpath("whatsapp_chat.zip")) + result = process(DATA_PATH.joinpath("_chat.txt")) + assert len(result) == 1 + df_result = result[0]["data_frame"] + assert_frame_equal(df_result, df_expected) + + diff --git a/data_extractor/whatsapp/__init__.py b/data_extractor/whatsapp/__init__.py deleted file mode 100644 index 63f5f31..0000000 --- a/data_extractor/whatsapp/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -__version__ = '0.2.0' - -import zipfile -import pandas as pd - - -def process(file_data): - names = [] - zfile = zipfile.ZipFile(file_data) - data = [] - for name in zfile.namelist(): - names.append(name) - info = zfile.getinfo(name) - data.append((name, info.compress_size, info.file_size)) - - return [{ - "id": "overview", - "title": "The following files where read:", - "data_frame": pd.DataFrame(data, columns=["filename", "compressed size", "size"]) - }] diff --git a/data_extractor/whatsapp_account_info/__init__.py b/data_extractor/whatsapp_account_info/__init__.py new file mode 100644 index 0000000..26cbd7e --- /dev/null +++ b/data_extractor/whatsapp_account_info/__init__.py @@ -0,0 +1,96 @@ +__version__ = '0.2.0' + +import zipfile +import re +import pandas as pd +import json + +HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") +FILE_RE = re.compile(r".*.json$") + + +class ColnamesDf: + GROUPS = 'groups' + """Groups column""" + + CONTACTS = 'contacts' + """Contacts column""" + + +COLNAMES_DF = ColnamesDf() + + +def format_results(df): + results = [] + results.append( + { + "id": "Whatsapp account info", + "title": "The account information file is read:", + "data_frame": df + } + ) + return results + + +def format_errors(errors): + data_frame = pd.DataFrame() + data_frame["Messages"] = pd.Series(errors, name="Messages") + return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame} + + +def extract_data(log_error, data): + # data = pd.read_csv('whatsapp/df_chat.csv') + # return 1,1 + groups_no = 0 + contacts_no = 0 + try: + groups_no = len(data[COLNAMES_DF.GROUPS]) + except (TypeError, KeyError) as e: + print("No group is available") + try: + contacts_no = len(data[COLNAMES_DF.CONTACTS]) + except (TypeError, KeyError) as e: + print("No contact is available") + + if (groups_no == 0) and (contacts_no == 0): + log_error("Neither group nor contact is available") + return groups_no, contacts_no + + +def parse_records(log_error, f): + try: + data = json.load(f) + except json.JSONDecodeError: + log_error(f"Could not parse: {f.name}") + else: + return data + + +def parse_zipfile(log_error, zfile): + for name in zfile.namelist(): + if HIDDEN_FILE_RE.match(name): + continue + if not FILE_RE.match(name): + continue + return parse_records(log_error, zfile.open(name)) + log_error("No Json file is available") + + +def process(file_data): + errors = [] + log_error = errors.append + zfile = zipfile.ZipFile(file_data) + data = parse_zipfile(log_error, zfile) + + if data is not None: + groups_no, contacts_no = extract_data(log_error, data) + + if errors: + return [format_errors(errors)] + + d = {'number_of_groups': [groups_no], 'number_of_contacts': [contacts_no]} + df = pd.DataFrame(data=d) + formatted_results = format_results(df) + + return formatted_results + diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py new file mode 100644 index 0000000..a7673d3 --- /dev/null +++ b/data_extractor/whatsapp_chat/__init__.py @@ -0,0 +1,627 @@ +"""Parser utils. +The main part is extracted from https://github.com/lucasrodes/whatstk.git +""" +__version__ = '0.2.0' + +import os +import re +from datetime import datetime +import pandas as pd +import hashlib +import zipfile +from pathlib import Path + + +URL_PATTERN = r'(https?://\S+)' +LOCATION_PATTERN = r'(Location: https?://\S+)' +ATTACH_FILE_PATTERN = r'()' +FILE_RE = re.compile(r".*chat.*.txt$") +HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") + +hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:'] + + +class ColnamesDf: + """Access class constants using variable ``utils.COLNAMES_DF``.""" + + DATE = 'date' + """Date column""" + + USERNAME = 'username' + """Username column""" + + MESSAGE = 'message' + """Message column""" + + MESSAGE_LENGTH = 'message_length' + """Message length column""" + + FirstMessage = 'first_message_date' + """Date of first message column""" + + LastMessage = 'last_message_date' + """Date of last message column""" + + MESSAGE_NO = 'message_no' + """Number of Message column""" + + WORDS_NO = 'total_words_no' + """Total number of words column""" + + REPLY_2USER = 'reply_2_user' + """Who replies to the user the most column""" + + MAX_REPLY_2 = 'max_reply_2' + """User replies to who the most column""" + + USER_REPLY2 = 'user_reply2' + """User replies to who the most column""" + + URL_NO = 'url_no' + """Number of URLs column""" + + LOCATION_NO = 'location_no' + """Number of locations column""" + + FILE_NO = 'file_no' + """Number of files column""" + + OUT_DEGREE = 'out_degree' + """Total number of sent message column""" + + IN_DEGREE = 'in_degree' + """Total number of received message column""" + + EMOJI_NO = 'emoji_no' + """Total number of emojies column""" + + EMOJI_Fav = 'emoji_fav' + """Favorite emojies column""" + + +COLNAMES_DF = ColnamesDf() + +# *** parsing functions *** +regex_simplifier = { + '%Y': r'(?P\d{2,4})', + '%y': r'(?P\d{2,4})', + '%m': r'(?P\d{1,2})', + '%d': r'(?P\d{1,2})', + '%H': r'(?P\d{1,2})', + '%I': r'(?P\d{1,2})', + '%M': r'(?P\d{2})', + '%S': r'(?P\d{2})', + '%P': r'(?P[AaPp].? ?[Mm].?)', + '%p': r'(?P[AaPp].? ?[Mm].?)', + '%name': fr'(?P<{COLNAMES_DF.USERNAME}>[^:]*)' +} + + +def generate_regex(log_error, hformat): + r"""Generate regular expression from hformat. + Parameters + ---------- + log_error : list + List of error messages + hformat :str + Simplified syntax for the header, e.g. ``'%y-%m-%d, %H:%M:%S - %name:'``. + Returns + ------- + str + Regular expression corresponding to the specified syntax + """ + items = re.findall(r'\%\w*', hformat) + + for i in items: + try: + hformat = hformat.replace(i, regex_simplifier[i]) + except KeyError: + log_error(f"Could find regular expression for : {i}") + + hformat = hformat + ' ' + hformat_x = hformat.split('(?P[^:]*)')[0] + return hformat, hformat_x + + +def add_schema(df): + """Add default chat schema to df. + Parameters + ---------- + df : pandas.DataFrame + Chat DataFrame. + Returns + ------- + pandas.DataFrame + Chat DataFrame with correct dtypes + """ + df = df.astype({ + COLNAMES_DF.DATE: pd.StringDtype(), + COLNAMES_DF.USERNAME: pd.StringDtype(), + COLNAMES_DF.MESSAGE: pd.StringDtype() + }) + return df + + +def parse_line(text, headers, i): + """Get date, username and message from the i:th intervention. + Parameters + ---------- + text : str + Whole log chat text + headers : list + All headers. + i : int + Index denoting the message number + Returns + ------- + dict + ith date, username and message. + """ + result_ = headers[i].groupdict() + if 'ampm' in result_: + hour = int(result_['hour']) + mode = result_.get('ampm').lower() + if hour == 12 and mode == 'am': + hour = 0 + elif hour != 12 and mode == 'pm': + hour += 12 + else: + hour = int(result_['hour']) + + # Check format of year. If year is 2-digit represented we add 2000 + if len(result_['year']) == 2: + year = int(result_['year']) + 2000 + else: + year = int(result_['year']) + + if 'seconds' not in result_: + date = datetime(year, int(result_['month']), int(result_['day']), hour, + int(result_['minutes'])) + else: + date = datetime(year, int(result_['month']), int(result_['day']), hour, + int(result_['minutes']), int(result_['seconds'])) + username = result_[COLNAMES_DF.USERNAME] + message = get_message(text, headers, i) + line_dict = { + COLNAMES_DF.DATE: date, + COLNAMES_DF.USERNAME: username, + COLNAMES_DF.MESSAGE: message + } + return line_dict + + +def remove_alerts_from_df(r_x, df): + """Try to get rid of alert/notification messages. + Parameters + ---------- + r_x : str + Regular expression to detect whatsapp warnings + df : pandas.DataFrame + pandas.DataFrame with all interventions + Returns + ------- + pandas.DataFrame + Fixed version of input DataFrame + """ + df_new = df.copy() + df_new.loc[:, COLNAMES_DF.MESSAGE] = df_new[COLNAMES_DF.MESSAGE].apply(lambda x: remove_alerts_from_line(r_x, x)) + return df_new + + +def remove_alerts_from_line(r_x, line_df): + """Remove line content that is not desirable (automatic alerts etc.). + Parameters + ---------- + r_x : str + Regula expression to detect WhatsApp warnings + line_df : str + Message sent as string + Returns + ------- + str + Cleaned message string + """ + if re.search(r_x, line_df): + return line_df[:re.search(r_x, line_df).start()] + else: + return line_df + + +def get_message(text, headers, i): + """Get i:th message from text. + Parameters + ---------- + text : str + Whole log chat text + headers : list + All headers + i : int + Index denoting the message number + Returns + ------- + str + ith message. + """ + msg_start = headers[i].end() + msg_end = headers[i + 1].start() if i < len(headers) - 1 else headers[i].endpos + msg = text[msg_start:msg_end].strip() + return msg + + +def parse_text(text, regex): + """Parse chat using given regex. + Parameters + ---------- + text : str + Whole log chat text + regex : str + Regular expression + Returns + ------- + pandas.DataFrame + pandas.DataFrame with messages sent by users, index is the date the messages was sent. + Raises + ------ + RegexError + When provided regex could not match the text + """ + result = [] + headers = list(re.finditer(regex, text)) + try: + for i in range(len(headers)): + line_dict = parse_line(text, headers, i) + result.append(line_dict) + except KeyError: + print("Could not match the provided regex with provided text. No match was found.") + return None + + df_chat = pd.DataFrame.from_records(result) + df_chat = df_chat[[COLNAMES_DF.DATE,COLNAMES_DF.USERNAME, COLNAMES_DF.MESSAGE]] + + # clean username + df_chat[COLNAMES_DF.USERNAME] = df_chat[COLNAMES_DF.USERNAME].apply(lambda u: u.strip('\u202c')) + + return df_chat + + +def make_chat_df(log_error, text, hformat): + """Load chat as a DataFrame. + Parameters + ---------- + log_error : list + List of error messages + text : str + Text of the chat + hformat : str + Simplified syntax for the header, e.g. ``'%y-%m-%d, %H:%M:%S - %name:'`` + Returns + ------- + pandas.DataFrame + A pandas.DataFrame with three columns, i.e. 'date', 'username', and 'message' + """ + # Bracket is reserved character in RegEx, add backslash before them. + hformat = hformat.replace('[', r'\[').replace(']', r'\]') + + # Generate regex for given hformat + r, r_x = generate_regex(log_error,hformat=hformat) + + # Parse chat to DataFrame + try: + df = parse_text(text, r) + df = remove_alerts_from_df(r_x, df) + df = add_schema(df) + + return df + except: + print(f"hformat : {hformat} is not match with the given text") + return None + + +def parse_chat(log_error, data): + """Parse chat and test it with defined hformats. + Parameters + ---------- + log_error : list + List of error messages. + data : str + Data read from the chat file + Returns + ------- + pandas.dataframe + A pandas.DataFrame with three columns, i.e. 'date', 'username', and 'message' + """ + for hformat in hformats: + # Build DataFrame + df = make_chat_df(log_error, data, hformat) + if df is not None: + return df + log_error("hformats did not match the provided text. No match was found") + return None + + +def decode_chat(log_error, f, filename): + """Parse the given zip file. + Parameters + ---------- + log_error : list + List of error messages. + f : bytes + bytes of the file name in the zip file + filename : str + Name of a compressed file in the zip file. + Returns + ------- + pandas.DataFrame + A pandas.DataFrame which includes the content of the given chat file. + """ + try: + data = f.decode("utf-8") + except: + log_error(f"Could not decode to utf-8: {filename}") + else: + return parse_chat(log_error, data) + + +def parse_zipfile(log_error, zfile): + """Parse the given zip file. + Parameters + ---------- + log_error : list + List of error messages + zfile : ZipFile object + Regular expression + Returns + ------- + list + A list of pandas.DataFrames which include the content of chat files. + """ + results = [] + for name in zfile.namelist(): + if HIDDEN_FILE_RE.match(name): + continue + if not FILE_RE.match(name): + continue + chats = decode_chat(log_error,zfile.read(name),name) + results.append(chats) + if len(results)==0: + log_error("No valid chat file is available") + return results + +# *** test related function *** + + +def input_df(data_path): + """Create inputs df_chats and df_participants, used for test purposes. + Parameters + ---------- + data_path : str + File path of zip file + Returns + ------- + pandas.DataFrame + df_chats and df_participants + """ + errors = [] + log_error = errors.append + fp = os.path.join(data_path, "whatsapp_chat.zip") + zfile = zipfile.ZipFile(fp) + chats = parse_zipfile(log_error, zfile) + participants = extract_participants_features(chats, anonymize=False) + return chats[0], participants[0] + +# *** analysis functions *** + + +def get_response_matrix(df_chat): + """Create a response matrix for the usernames mentioned in the given DataFrame. + Parameters + ---------- + df_chat: padas.DataFrame + A DataFrame including chat data + Returns + ------- + pandas.DataFrame + A DataFrame with senders in the rows and receivers in the columns + """ + users = set(df_chat[COLNAMES_DF.USERNAME]) + users = sorted(users) + + # Get list of username transitions and initialize dicitonary with counts + user_transitions = df_chat[COLNAMES_DF.USERNAME].tolist() + responses = {user: dict(zip(users, [0] * len(users))) for user in users} + # Fill count dictionary + for i in range(1, len(user_transitions)): + sender = user_transitions[i] + receiver = user_transitions[i - 1] + if sender != receiver: + responses[sender][receiver] += 1 + + responses = pd.DataFrame.from_dict(responses, orient='index') + return responses + + +# def make_salt(): +# """Return an string as salt for anonym_txt function. +# Returns +# ------- +# str +# The salt value is deliberately set to be a fixed value for all the usernames, because then we can generate the +# same hashed value for the same value in the UERNAME, REPLY_2USER, and USER_REPLY2 columns. +# """ +# return str.encode('WhatsAppProject@2022') + + +def anonymize_participants(df_participants): + """Anonymize text data. + Anonymize USERNAME, REPLY_2USER, and USER_REPLY2 columns of the given DataFrame. + Parameters + ---------- + df_participants : pandas.DataFrame + A DataFrame including participants data + Returns + ------- + pandas.DataFrame + An anonymized DataFrame + """ + # salt = make_salt() + # df_participants[COLNAMES_DF.USERNAME] = df_participants[COLNAMES_DF.USERNAME].apply(lambda u: anonym_txt(u, salt)) + # df_participants[COLNAMES_DF.REPLY_2USER] = df_participants[COLNAMES_DF.REPLY_2USER].apply(lambda u: anonym_txt(u,salt)) + # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) + # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() + + stacked = df_participants[['username', 'user_reply2', 'reply_2_user']].stack() + df_participants[['username', 'user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], + index=stacked.index).unstack() + df_participants[['username', 'user_reply2', 'reply_2_user']] = 'person' + df_participants[['username', 'user_reply2', + 'reply_2_user']].astype(str) + return df_participants + + +def get_participants_features(df_chat): + """Calculate participant features from the given chat. + Parameter + ---------- + df_chat : pandas.DataFrame + A DataFrame including chat data + Returns + ------- + pandas.DataFrame + A DataFrame which includes participants and their features + """ + # Calculate the number of words in messages + df_chat[COLNAMES_DF.WORDS_NO] = df_chat['message'].apply(lambda x: len(x.split())) + # number of ulrs + df_chat[COLNAMES_DF.URL_NO] = df_chat["message"].apply(lambda x: len(re.findall(URL_PATTERN, x))).astype(int) + # number of locations + df_chat[COLNAMES_DF.LOCATION_NO] = df_chat["message"].apply( + lambda x: len(re.findall(LOCATION_PATTERN, x))).astype(int) + # number of files + df_chat[COLNAMES_DF.FILE_NO] = df_chat["message"].apply(lambda x: len(re.findall(ATTACH_FILE_PATTERN, x))).astype( + int) + # number of messages + df_chat[COLNAMES_DF.MESSAGE_NO] = 1 + + df_participants = df_chat.groupby(COLNAMES_DF.USERNAME).agg({ + COLNAMES_DF.WORDS_NO: 'sum', + COLNAMES_DF.URL_NO: 'sum', + COLNAMES_DF.LOCATION_NO: 'sum', + COLNAMES_DF.FILE_NO: 'sum', + COLNAMES_DF.MESSAGE_NO: 'sum' + }).reset_index() + + response_matrix = get_response_matrix(df_chat) + out_degree = response_matrix.sum(axis=1) + in_degree = response_matrix.T.sum(axis=1) + user_reply2 = response_matrix.idxmax(axis=1) + reply2_user = response_matrix.T.idxmax(axis=1) + + response_matrix[COLNAMES_DF.OUT_DEGREE] = out_degree + response_matrix[COLNAMES_DF.IN_DEGREE] = in_degree + response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 + response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user + response_matrix.index.name = COLNAMES_DF.USERNAME + response_matrix = response_matrix.loc[:, + [COLNAMES_DF.OUT_DEGREE, COLNAMES_DF.IN_DEGREE, COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] + response_matrix = response_matrix.reset_index() + + df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") + + return df_participants + + +def extract_participants_features(chats, anonymize=True): + """Parse the given zip file. + Parameters + ---------- + chats : list + List of DataFrames including chat data + anonymize : bool + Indicates if usernames should be anonymized + Returns + ------- + list + A list of DataFrames which include participant features + """ + results = [] + for chat in chats: + df = get_participants_features(chat) + if anonymize: + df = anonymize_participants(df) + results.append(df) + return results + +# ***** end of analysis functions ***** + + +def format_results(df_list): + """Format results to the standard format. + Parameters + ---------- + df_list: pandas.dataframe + Returns + ------- + pandas.dataframe + """ + results = [] + for df in df_list: + results.append( + { + "id": "overview", + "title": "The following data is extracted from the file:", + "data_frame": df + } + ) + return results + + +def format_errors(errors): + """Return errors in the format of dataframe. + Parameters + ---------- + errors: str + Returns + ------- + pandas.dataframe + """ + data_frame = pd.DataFrame() + data_frame["Messages"] = pd.Series(errors, name="Messages") + return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame} + + +def process(file_data): + """Convert whatsapp chat_file.zip to participants dataframe. + This is the main function which extracts the participants + information from the row chat_file.zip provided by data-donators. + Parameters + ---------- + file_data : str + The path of the chat_file.zip + Returns + ------- + pandas.dataframe + Extracted data from the chat_file + """ + errors = [] + log_error = errors.append + zfile = None + chats = [] + try: + zfile = zipfile.ZipFile(file_data) + except: + if FILE_RE.match(file_data.name): + zfile = open(file_data, encoding="utf8") + chat = parse_chat(log_error, zfile.read()) + chats.append(chat) + else: + log_error("There is not a valid file format.") + return [format_errors(errors)] + else: + chats = parse_zipfile(log_error, zfile) + if errors: + return [format_errors(errors)] + + participants = extract_participants_features(chats) + formatted_results = format_results(participants) + + return formatted_results + diff --git a/data_extractor/whatsapp/index.html b/data_extractor/whatsapp_chat/index.html similarity index 100% rename from data_extractor/whatsapp/index.html rename to data_extractor/whatsapp_chat/index.html