diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt index c43d07a..845305d 100644 --- a/data_extractor/tests/data/_chat.txt +++ b/data_extractor/tests/data/_chat.txt @@ -1,3 +1,4 @@ +[16/03/2022, 15:10:17] Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more. [16/03/2022, 15:20:25] person1: Hi shiva! [16/03/2022, 15:25:38] person2: Hi 👋 [16/03/2022, 15:26:48] person3: Hoi! diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip index b3f6edf..c40699d 100644 Binary files a/data_extractor/tests/data/whatsapp_chat.zip and b/data_extractor/tests/data/whatsapp_chat.zip differ diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index b10b0b6..87f3768 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -1,59 +1,75 @@ -from data_extractor.whatsapp_chat import process -from data_extractor.whatsapp_chat import anonymize_participants -from data_extractor.whatsapp_chat import get_df_per_participant from pathlib import Path import pandas as pd -from pandas.testing import assert_frame_equal +from whatsapp_chat import process +from whatsapp_chat import anonymize_participants +from pandas.testing import assert_frame_equal DATA_PATH = Path(__file__).parent / "data" EXPECTED = [ - {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), 'Date last message': pd.to_datetime('2022-03-24 20:19:38')}, - - {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), 'Date last message': pd.to_datetime('2022-03-26 18:52:15')}, - - {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, - 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), 'Date last message': pd.to_datetime('2022-03-16 15:26:48')}, - - {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, - 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), 'Date last message': pd.to_datetime('2022-03-20 20:08:51')} + {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, + 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), + 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, + + {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), + 'Date last message': pd.to_datetime('2022-03-26 18:52:15'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'}, + + {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, + 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), + 'Date last message': pd.to_datetime('2022-03-16 15:26:48'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, + + {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, + 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), + 'Date last message': pd.to_datetime('2022-03-20 20:08:51'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'} ] -# EXPECTED_1 = {'Description': ['Total number of words', 'Number of messages', 'Date first message', 'Date last message', -# 'Number of URLs', 'Number of shared locations'], -# 'Value': [20, 3, pd.to_datetime('2022-03-16 15:20:25'), pd.to_datetime('2022-03-24 20:19:38'), 1, 1]} -# -# df_expected_1 = pd.DataFrame(data=EXPECTED_1) - def test_process(): """ Test process function. - compares the expected dataframe with the output of the process function to check if all the columns are match. + compares the expected dataframe with the output of the process function + to check if all the columns are match. Raises ------- - AssertionError: When provided expected dataframe could not match the participants dataframe + AssertionError: When provided expected dataframe could not match the + participants dataframe """ df_expected = pd.DataFrame(EXPECTED) df_expected = anonymize_participants(df_expected) df_expected['Number of messages'] = df_expected['Number of messages'].astype('int64') df_expected['Number of URLs'] = df_expected['Number of URLs'].astype('int32') - df_expected['Number of shared locations'] = df_expected['Number of shared locations'].astype('int32') + df_expected['Number of shared locations'] = \ + df_expected['Number of shared locations'].astype('int32') df_expected['file_no'] = df_expected['file_no'].astype('int32') results = [] df_melt = pd.melt(df_expected, id_vars=["username"], - value_vars=["Total number of words", "Number of messages", "Date first message", "Date last message", - "Number of URLs", "file_no", "Number of shared locations"], var_name='Description', value_name='Value') + value_vars=["Total number of words", + "Number of messages", + "Date first message", + "Date last message", + "Number of URLs", + "file_no", + "Number of shared locations", + "reply_2_user", + "user_reply2"], + var_name='Description', value_name='Value') usernames = df_melt["username"].unique() - for u in usernames: - df_user = df_melt[(df_melt["username"] == u) & df_melt["Value"] != 0] + for user in usernames: + df_user = df_melt[(df_melt["username"] == user) & df_melt["Value"] != 0] results.append(df_user) expected_results = [] @@ -66,29 +82,10 @@ def test_process(): "data_frame": df[["Description", "Value"]].reset_index(drop=True) } ) - # print(type(expected_results[0]["data_frame"])) - # print(expected_results[1]["data_frame"]) - # print('******') - # result = process(DATA_PATH.joinpath("whatsapp_chat.zip")) df_result = process(DATA_PATH.joinpath("_chat.txt")) - # print(df_result[1]["data_frame"]) + assert_frame_equal(df_result[0]["data_frame"], expected_results[0]["data_frame"]) assert_frame_equal(df_result[1]["data_frame"], expected_results[1]["data_frame"]) assert_frame_equal(df_result[2]["data_frame"], expected_results[2]["data_frame"]) assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"]) - - # print(type(df_result[0]["data_frame"])) - # print(df_result[0]["data_frame"]) - # print(df_expected_1) - # - # # assert len(result_file) == 1 - # - # - # assert_frame_equal(df_result[0]["data_frame"], df_expected_1) - - -if __name__ == "__main__": - test_process() - - diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index d679838..f6b2045 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -7,11 +7,7 @@ import re from datetime import datetime import pandas as pd -import hashlib import zipfile -from pathlib import Path -#from nltk.corpus import stopwords -#from sklearn.feature_extraction.text import TfidfVectorizer URL_PATTERN = r'(https?://\S+)' @@ -20,10 +16,7 @@ FILE_RE = re.compile(r".*.txt$") HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") -SYSTEM_MESSAGES=[ - 'Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.', - 'Berichten en gesprekken worden end-to-end versleuteld. Niemand buiten deze chat kan ze lezen of beluisteren, zelfs WhatsApp niet.' -] +SYSTEM_MESSAGES = ['end-to-end','WhatsApp'] hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:', '[%d-%m-%y %H:%M:%S] %name:'] @@ -42,42 +35,33 @@ class ColnamesDf: MESSAGE_LENGTH = 'message_length' """Message length column""" - FirstMessage = 'Date first message' #'first_message_date' + FirstMessage = 'Date first message' """Date of first message column""" - LastMessage = 'Date last message' #'last_message_date' + LastMessage = 'Date last message' """Date of last message column""" - MESSAGE_NO = 'Number of messages' #'message_no' + MESSAGE_NO = 'Number of messages' """Number of Message column""" - WORDS_NO = 'Total number of words' #'total_words_no' + WORDS_NO = 'Total number of words' """Total number of words column""" - REPLY_2USER = 'reply_2_user' + REPLY_2USER = 'Who replies to you the most often?' """Who replies to the user the most column""" - MAX_REPLY_2 = 'max_reply_2' + USER_REPLY2 = 'Who do you most often reply to?' """User replies to who the most column""" - USER_REPLY2 = 'user_reply2' - """User replies to who the most column""" - - URL_NO = 'Number of URLs'#,'url_no' + URL_NO = 'Number of URLs' """Number of URLs column""" - LOCATION_NO = 'Number of shared locations'#'location_no' + LOCATION_NO = 'Number of shared locations' """Number of locations column""" - FILE_NO = 'Number of shared files'#'file_no' + FILE_NO = 'Number of shared files' """Number of files column""" - OUT_DEGREE = 'out_degree' - """Total number of sent message column""" - - IN_DEGREE = 'in_degree' - """Total number of received message column""" - EMOJI_NO = 'emoji_no' """Total number of emojies column""" @@ -215,9 +199,11 @@ def remove_alerts_from_df(r_x, df): pandas.DataFrame Fixed version of input DataFrame """ + + alerts_no = count_alerts(r_x, df) df_new = df.copy() df_new.loc[:, COLNAMES_DF.MESSAGE] = df_new[COLNAMES_DF.MESSAGE].apply(lambda x: remove_alerts_from_line(r_x, x)) - return df_new + return df_new,alerts_no def remove_alerts_from_line(r_x, line_df): @@ -234,11 +220,32 @@ def remove_alerts_from_line(r_x, line_df): Cleaned message string """ if re.search(r_x, line_df): + print(line_df[:re.search(r_x, line_df).start()]) return line_df[:re.search(r_x, line_df).start()] else: return line_df +def count_alerts(r_x, df): + """Count line content that is not desirable (automatic alerts etc.). + Parameters + ---------- + r_x : str + Regula expression to detect WhatsApp warnings + df : pandas.DataFrame + pandas.DataFrame with all interventions + + Returns + ------- + int + Number of line contents that is not desirable + """ + + # alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: (re.search(r_x, x) is not None)) + alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: re.findall(r_x, x)) + return alerts_count.str.len().sum() + + def get_message(text, headers, i): """Get i:th message from text. Parameters @@ -320,9 +327,12 @@ def make_chat_df(log_error, text, hformat): # Parse chat to DataFrame try: df = parse_text(text, r) - df = remove_alerts_from_df(r_x, df) + df, alerts_no = remove_alerts_from_df(r_x, df) df = add_schema(df) + if alerts_no>0: + log_error("Number of unprocessed system messages: "+str(alerts_no)) + return df except: print(f"hformat : {hformat} is not match with the given text") @@ -481,21 +491,15 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - # stacked = df_participants[['username', 'user_reply2', 'reply_2_user']].stack() - # df_participants[['username', 'user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], - # index=stacked.index).unstack() - # df_participants[['username', 'user_reply2', 'reply_2_user']] = 'person' + df_participants[['username', 'user_reply2', - # 'reply_2_user']].astype(str) - # - - df_participants['username'] = pd.factorize(df_participants.username)[0] + 1 - df_participants['username'] = 'person' + df_participants['username'].astype(str) - - + stacked = df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].stack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + pd.Series(stacked.factorize()[0], index=stacked.index).unstack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + 'person' + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].astype(str) return df_participants -def get_df_per_participant(df, anonymize): +def get_wide_to_long_participant(df): """Generate one dataframe for each participant . Parameter ---------- @@ -514,16 +518,16 @@ def get_df_per_participant(df, anonymize): df_melt = pd.melt(df, id_vars=[COLNAMES_DF.USERNAME], value_vars=[COLNAMES_DF.WORDS_NO, COLNAMES_DF.MESSAGE_NO, COLNAMES_DF.FirstMessage, COLNAMES_DF.LastMessage, COLNAMES_DF.URL_NO, COLNAMES_DF.FILE_NO, - COLNAMES_DF.LOCATION_NO], + COLNAMES_DF.LOCATION_NO, + COLNAMES_DF.REPLY_2USER, + COLNAMES_DF.USER_REPLY2], var_name=COLNAMES_DF.DESCRIPTION, value_name=COLNAMES_DF.VALUE) - # usernames = set(df_melt[COLNAMES_DF.USERNAME]) - usernames = df_melt[COLNAMES_DF.USERNAME].unique() + usernames = sorted(set(df_melt[COLNAMES_DF.USERNAME])) for u in usernames: df_user = df_melt[(df_melt[COLNAMES_DF.USERNAME] == u) & df_melt[COLNAMES_DF.VALUE] != 0] - # if anonymize: - # df_user = anonymize_participants(df_user) + results.append(df_user) return results @@ -568,25 +572,20 @@ def get_participants_features(df_chat): }).reset_index() response_matrix = get_response_matrix(df_chat) - out_degree = response_matrix.sum(axis=1) - in_degree = response_matrix.T.sum(axis=1) user_reply2 = response_matrix.idxmax(axis=1) reply2_user = response_matrix.T.idxmax(axis=1) - response_matrix[COLNAMES_DF.OUT_DEGREE] = out_degree - response_matrix[COLNAMES_DF.IN_DEGREE] = in_degree response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user response_matrix.index.name = COLNAMES_DF.USERNAME - response_matrix = response_matrix.loc[:, - [COLNAMES_DF.OUT_DEGREE, COLNAMES_DF.IN_DEGREE, COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] + response_matrix = response_matrix.loc[:,[COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] response_matrix = response_matrix.reset_index() df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") return df_participants -def remove_system_messages(chat): +def remove_system_messages(log_error, chat): """Removes system messages from chat Parameters ---------- @@ -597,11 +596,14 @@ def remove_system_messages(chat): pandas.DataFrame A filtered dataframe """ - # print(chat.loc[0,COLNAMES_DF.MESSAGE]) - # print(SYSTEM_MESSAGES[1]) - for m in SYSTEM_MESSAGES: - group_name = chat.loc[chat[COLNAMES_DF.MESSAGE]==m,COLNAMES_DF.USERNAME] - # print(group_name) + + message0 = chat.loc[0, COLNAMES_DF.MESSAGE] + is_system_message = True if all(s in message0 for s in SYSTEM_MESSAGES) else False + if is_system_message: + group_name = chat.loc[0, COLNAMES_DF.USERNAME] + log_error("Identified group name:"+group_name) + chat = chat.loc[chat[COLNAMES_DF.USERNAME] != group_name,] + return chat def extract_participants_features(chat, anonymize=True): @@ -619,13 +621,16 @@ def extract_participants_features(chat, anonymize=True): """ df = get_participants_features(chat) - results = get_df_per_participant(df, anonymize) + if anonymize: + df= anonymize_participants(df) + + results = get_wide_to_long_participant(df) return results # ***** end of analysis functions ***** -def format_results(df_list): +def format_results(df_list, error): """Format results to the standard format. Parameters ---------- @@ -639,11 +644,13 @@ def format_results(df_list): user_name = pd.unique(df[COLNAMES_DF.USERNAME])[0] results.append( { - "id": user_name,#"overview", - "title": user_name,#"The following data is extracted from the file:", + "id": user_name, + "title": user_name, "data_frame": df[[COLNAMES_DF.DESCRIPTION,COLNAMES_DF.VALUE]].reset_index(drop=True) } ) + if len(error)>0: + results = results+error return results @@ -656,46 +663,49 @@ def format_errors(errors): ------- pandas.dataframe """ + if len(errors) == 0: + return [] data_frame = pd.DataFrame() data_frame["Messages"] = pd.Series(errors, name="Messages") - return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame} + return [{"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame}] def process(file_data): - """Convert whatsapp chat_file.zip to participants dataframe. + """Convert whatsapp chat file to participant dataframes. This is the main function which extracts the participants - information from the row chat_file.zip provided by data-donators. + information from the row chat file provided by data-donators. Parameters ---------- file_data : str - The path of the chat_file.zip + The path of the chat file. It can be in zip or txt format. Returns ------- pandas.dataframe - Extracted data from the chat_file + Extracted data from the chat file """ errors = [] log_error = errors.append - zfile = None - #chats = [] + try: zfile = zipfile.ZipFile(file_data) except: if FILE_RE.match(file_data.name): tfile = open(file_data, encoding="utf8") chat = parse_chat(log_error, tfile.read()) - #chats.append(chat) + else: log_error("There is not a valid file format.") - return [format_errors(errors)] + return format_errors(errors) else: chat = parse_zipfile(log_error, zfile) - if errors: - return [format_errors(errors)] - # print(chat) - chat = remove_system_messages(chat) - participants = extract_participants_features(chat) - formatted_results = format_results(participants) + if chat is not None: + chat = remove_system_messages(log_error,chat) + participants = extract_participants_features(chat) + + formatted_results = format_results(participants, format_errors(errors)) + + else: + return format_errors(errors) return formatted_results