diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt index 2ffeac2..845305d 100644 --- a/data_extractor/tests/data/_chat.txt +++ b/data_extractor/tests/data/_chat.txt @@ -1,11 +1,10 @@ -[16/03/2022, 15:20:25] user1: Hi shiva! -[16/03/2022, 15:25:38] user2: Hi 👋 -[16/03/2022, 15:26:48] user3: Hoi! -[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E -[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469 -[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants -[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁 -[26/03/2022, 18:52:15] user2: Well done Utrecht 😁 -Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4 -https://nos.nl/l/2421368#UPDATE-container-60145354 -[14/07/2020, 22:05:54] user4: 👍Bedankt \ No newline at end of file +[16/03/2022, 15:10:17] Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more. +[16/03/2022, 15:20:25] person1: Hi shiva! +[16/03/2022, 15:25:38] person2: Hi 👋 +[16/03/2022, 15:26:48] person3: Hoi! +[16/03/2022, 18:39:29] person2: https://youtu.be/KBmUTY6mK_E +[16/03/2022, 18:35:51] person1: ‎Location: https://maps.google.com/?q=52.089451,5.108469 +[20/03/2022, 20:08:51] person4: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants +[24/03/2022, 20:19:38] person1: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁 +[26/03/2022, 18:52:15] person2: Well done Utrecht 😁 +[14/07/2020, 22:05:54] person4: 👍Bedankt \ No newline at end of file diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip index b3f6edf..c40699d 100644 Binary files a/data_extractor/tests/data/whatsapp_chat.zip and b/data_extractor/tests/data/whatsapp_chat.zip differ diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index 6d5a6a6..87f3768 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -1,46 +1,91 @@ -from whatsapp_chat import process -from whatsapp_chat import anonymize_participants from pathlib import Path import pandas as pd + +from whatsapp_chat import process +from whatsapp_chat import anonymize_participants + from pandas.testing import assert_frame_equal DATA_PATH = Path(__file__).parent / "data" EXPECTED = [ - {'username': 'user1', 'total_words_no': 20, 'url_no': 1, 'location_no': 1, 'file_no': 0, 'message_no': 3, - 'out_degree': 2, 'in_degree': 3, 'user_reply2': 'user2', 'reply_2_user': 'user2'}, + {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, + 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), + 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, - {'username': 'user2', 'total_words_no': 18, 'url_no': 2, 'location_no': 0, 'file_no': 0, 'message_no': 3, - 'out_degree': 3, 'in_degree': 3, 'user_reply2': 'user1', 'reply_2_user': 'user1'}, + {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), + 'Date last message': pd.to_datetime('2022-03-26 18:52:15'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'}, - {'username': 'user3', 'total_words_no': 1, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 1, - 'out_degree': 1, 'in_degree': 1, 'user_reply2': 'user2', 'reply_2_user': 'user2'}, + {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, + 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), + 'Date last message': pd.to_datetime('2022-03-16 15:26:48'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, - {'username': 'user4', 'total_words_no': 21, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 2, - 'out_degree': 2, 'in_degree': 1, 'user_reply2': 'user1', 'reply_2_user': 'user1'} + {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, + 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), + 'Date last message': pd.to_datetime('2022-03-20 20:08:51'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'} ] def test_process(): """ Test process function. - compares the expected dataframe with the output of the process function to check if all the columns are match. + compares the expected dataframe with the output of the process function + to check if all the columns are match. Raises ------- - AssertionError: When provided expected dataframe could not match the participants dataframe + AssertionError: When provided expected dataframe could not match the + participants dataframe """ df_expected = pd.DataFrame(EXPECTED) df_expected = anonymize_participants(df_expected) - df_expected['message_no'] = df_expected['message_no'].astype('int64') - df_expected['url_no'] = df_expected['url_no'].astype('int32') - df_expected['location_no'] = df_expected['location_no'].astype('int32') + df_expected['Number of messages'] = df_expected['Number of messages'].astype('int64') + df_expected['Number of URLs'] = df_expected['Number of URLs'].astype('int32') + df_expected['Number of shared locations'] = \ + df_expected['Number of shared locations'].astype('int32') df_expected['file_no'] = df_expected['file_no'].astype('int32') - # result = process(DATA_PATH.joinpath("whatsapp_chat.zip")) - result = process(DATA_PATH.joinpath("_chat.txt")) - assert len(result) == 1 - df_result = result[0]["data_frame"] - assert_frame_equal(df_result, df_expected) + results = [] + df_melt = pd.melt(df_expected, id_vars=["username"], + value_vars=["Total number of words", + "Number of messages", + "Date first message", + "Date last message", + "Number of URLs", + "file_no", + "Number of shared locations", + "reply_2_user", + "user_reply2"], + var_name='Description', value_name='Value') + + usernames = df_melt["username"].unique() + for user in usernames: + df_user = df_melt[(df_melt["username"] == user) & df_melt["Value"] != 0] + results.append(df_user) + + expected_results = [] + for df in results: + user_name = pd.unique(df["username"])[0] + expected_results.append( + { + "id": user_name, # "overview", + "title": user_name, # "The following data is extracted from the file:", + "data_frame": df[["Description", "Value"]].reset_index(drop=True) + } + ) + df_result = process(DATA_PATH.joinpath("_chat.txt")) + assert_frame_equal(df_result[0]["data_frame"], expected_results[0]["data_frame"]) + assert_frame_equal(df_result[1]["data_frame"], expected_results[1]["data_frame"]) + assert_frame_equal(df_result[2]["data_frame"], expected_results[2]["data_frame"]) + assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"]) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index a7673d3..f6b2045 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -7,18 +7,17 @@ import re from datetime import datetime import pandas as pd -import hashlib import zipfile -from pathlib import Path URL_PATTERN = r'(https?://\S+)' LOCATION_PATTERN = r'(Location: https?://\S+)' ATTACH_FILE_PATTERN = r'()' -FILE_RE = re.compile(r".*chat.*.txt$") +FILE_RE = re.compile(r".*.txt$") HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") -hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:'] +SYSTEM_MESSAGES = ['end-to-end','WhatsApp'] +hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:', '[%d-%m-%y %H:%M:%S] %name:'] class ColnamesDf: @@ -36,48 +35,45 @@ class ColnamesDf: MESSAGE_LENGTH = 'message_length' """Message length column""" - FirstMessage = 'first_message_date' + FirstMessage = 'Date first message' """Date of first message column""" - LastMessage = 'last_message_date' + LastMessage = 'Date last message' """Date of last message column""" - MESSAGE_NO = 'message_no' + MESSAGE_NO = 'Number of messages' """Number of Message column""" - WORDS_NO = 'total_words_no' + WORDS_NO = 'Total number of words' """Total number of words column""" - REPLY_2USER = 'reply_2_user' + REPLY_2USER = 'Who replies to you the most often?' """Who replies to the user the most column""" - MAX_REPLY_2 = 'max_reply_2' + USER_REPLY2 = 'Who do you most often reply to?' """User replies to who the most column""" - USER_REPLY2 = 'user_reply2' - """User replies to who the most column""" - - URL_NO = 'url_no' + URL_NO = 'Number of URLs' """Number of URLs column""" - LOCATION_NO = 'location_no' + LOCATION_NO = 'Number of shared locations' """Number of locations column""" - FILE_NO = 'file_no' + FILE_NO = 'Number of shared files' """Number of files column""" - OUT_DEGREE = 'out_degree' - """Total number of sent message column""" - - IN_DEGREE = 'in_degree' - """Total number of received message column""" - EMOJI_NO = 'emoji_no' """Total number of emojies column""" EMOJI_Fav = 'emoji_fav' """Favorite emojies column""" + DESCRIPTION = 'Description' + """Variable column in melted dataframe""" + + VALUE = 'Value' + """Value column in melted dataframe""" + COLNAMES_DF = ColnamesDf() @@ -203,9 +199,11 @@ def remove_alerts_from_df(r_x, df): pandas.DataFrame Fixed version of input DataFrame """ + + alerts_no = count_alerts(r_x, df) df_new = df.copy() df_new.loc[:, COLNAMES_DF.MESSAGE] = df_new[COLNAMES_DF.MESSAGE].apply(lambda x: remove_alerts_from_line(r_x, x)) - return df_new + return df_new,alerts_no def remove_alerts_from_line(r_x, line_df): @@ -222,11 +220,32 @@ def remove_alerts_from_line(r_x, line_df): Cleaned message string """ if re.search(r_x, line_df): + print(line_df[:re.search(r_x, line_df).start()]) return line_df[:re.search(r_x, line_df).start()] else: return line_df +def count_alerts(r_x, df): + """Count line content that is not desirable (automatic alerts etc.). + Parameters + ---------- + r_x : str + Regula expression to detect WhatsApp warnings + df : pandas.DataFrame + pandas.DataFrame with all interventions + + Returns + ------- + int + Number of line contents that is not desirable + """ + + # alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: (re.search(r_x, x) is not None)) + alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: re.findall(r_x, x)) + return alerts_count.str.len().sum() + + def get_message(text, headers, i): """Get i:th message from text. Parameters @@ -308,9 +327,12 @@ def make_chat_df(log_error, text, hformat): # Parse chat to DataFrame try: df = parse_text(text, r) - df = remove_alerts_from_df(r_x, df) + df, alerts_no = remove_alerts_from_df(r_x, df) df = add_schema(df) + if alerts_no>0: + log_error("Number of unprocessed system messages: "+str(alerts_no)) + return df except: print(f"hformat : {hformat} is not match with the given text") @@ -372,20 +394,20 @@ def parse_zipfile(log_error, zfile): Regular expression Returns ------- - list - A list of pandas.DataFrames which include the content of chat files. + pandas.DataFrame + A pandas.DataFrames which include the content of the chat file. """ - results = [] for name in zfile.namelist(): if HIDDEN_FILE_RE.match(name): continue if not FILE_RE.match(name): continue - chats = decode_chat(log_error,zfile.read(name),name) - results.append(chats) - if len(results)==0: + chat = decode_chat(log_error,zfile.read(name),name) + + if chat is None: log_error("No valid chat file is available") - return results + + return chat # *** test related function *** @@ -405,9 +427,9 @@ def input_df(data_path): log_error = errors.append fp = os.path.join(data_path, "whatsapp_chat.zip") zfile = zipfile.ZipFile(fp) - chats = parse_zipfile(log_error, zfile) - participants = extract_participants_features(chats, anonymize=False) - return chats[0], participants[0] + chat = parse_zipfile(log_error, zfile) + participants = extract_participants_features(chat, anonymize=False) + return chat, participants # *** analysis functions *** @@ -469,14 +491,48 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - stacked = df_participants[['username', 'user_reply2', 'reply_2_user']].stack() - df_participants[['username', 'user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], - index=stacked.index).unstack() - df_participants[['username', 'user_reply2', 'reply_2_user']] = 'person' + df_participants[['username', 'user_reply2', - 'reply_2_user']].astype(str) + stacked = df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].stack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + pd.Series(stacked.factorize()[0], index=stacked.index).unstack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + 'person' + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].astype(str) return df_participants +def get_wide_to_long_participant(df): + """Generate one dataframe for each participant . + Parameter + ---------- + df : pandas.DataFrame + A DataFrame which includes participants and their features + + anonymize : bool + Indicates if usernames should be anonymized + Returns + ------- + list pandas.DataFrame + A list of pandas.DataFrame. Each data frame includes the description of features and their values extracted + from a specific participant + """ + results = [] + df_melt = pd.melt(df, id_vars=[COLNAMES_DF.USERNAME], value_vars=[COLNAMES_DF.WORDS_NO, COLNAMES_DF.MESSAGE_NO, + COLNAMES_DF.FirstMessage, COLNAMES_DF.LastMessage, + COLNAMES_DF.URL_NO, COLNAMES_DF.FILE_NO, + COLNAMES_DF.LOCATION_NO, + COLNAMES_DF.REPLY_2USER, + COLNAMES_DF.USER_REPLY2], + var_name=COLNAMES_DF.DESCRIPTION, value_name=COLNAMES_DF.VALUE) + + usernames = sorted(set(df_melt[COLNAMES_DF.USERNAME])) + for u in usernames: + df_user = df_melt[(df_melt[COLNAMES_DF.USERNAME] == u) & + df_melt[COLNAMES_DF.VALUE] != 0] + + results.append(df_user) + + return results + + def get_participants_features(df_chat): """Calculate participant features from the given chat. Parameter @@ -488,6 +544,10 @@ def get_participants_features(df_chat): pandas.DataFrame A DataFrame which includes participants and their features """ + # Calculate first message date + df_chat[COLNAMES_DF.FirstMessage] = df_chat[COLNAMES_DF.DATE].astype('datetime64[ns]') + # Calculate last message date + df_chat[COLNAMES_DF.LastMessage] = df_chat[COLNAMES_DF.DATE].astype('datetime64[ns]') # Calculate the number of words in messages df_chat[COLNAMES_DF.WORDS_NO] = df_chat['message'].apply(lambda x: len(x.split())) # number of ulrs @@ -506,35 +566,52 @@ def get_participants_features(df_chat): COLNAMES_DF.URL_NO: 'sum', COLNAMES_DF.LOCATION_NO: 'sum', COLNAMES_DF.FILE_NO: 'sum', - COLNAMES_DF.MESSAGE_NO: 'sum' + COLNAMES_DF.MESSAGE_NO: 'sum', + COLNAMES_DF.FirstMessage: 'min', + COLNAMES_DF.LastMessage: 'max' }).reset_index() response_matrix = get_response_matrix(df_chat) - out_degree = response_matrix.sum(axis=1) - in_degree = response_matrix.T.sum(axis=1) user_reply2 = response_matrix.idxmax(axis=1) reply2_user = response_matrix.T.idxmax(axis=1) - response_matrix[COLNAMES_DF.OUT_DEGREE] = out_degree - response_matrix[COLNAMES_DF.IN_DEGREE] = in_degree response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user response_matrix.index.name = COLNAMES_DF.USERNAME - response_matrix = response_matrix.loc[:, - [COLNAMES_DF.OUT_DEGREE, COLNAMES_DF.IN_DEGREE, COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] + response_matrix = response_matrix.loc[:,[COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] response_matrix = response_matrix.reset_index() df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") return df_participants +def remove_system_messages(log_error, chat): + """Removes system messages from chat + Parameters + ---------- + chat : pandas.DataFrame + A DataFrame that includes chat data + Returns + ------- + pandas.DataFrame + A filtered dataframe + """ + + message0 = chat.loc[0, COLNAMES_DF.MESSAGE] + is_system_message = True if all(s in message0 for s in SYSTEM_MESSAGES) else False + if is_system_message: + group_name = chat.loc[0, COLNAMES_DF.USERNAME] + log_error("Identified group name:"+group_name) + chat = chat.loc[chat[COLNAMES_DF.USERNAME] != group_name,] + + return chat -def extract_participants_features(chats, anonymize=True): +def extract_participants_features(chat, anonymize=True): """Parse the given zip file. Parameters ---------- - chats : list - List of DataFrames including chat data + chat : pandas.DataFrame + A DataFrame that includes chat data anonymize : bool Indicates if usernames should be anonymized Returns @@ -542,18 +619,18 @@ def extract_participants_features(chats, anonymize=True): list A list of DataFrames which include participant features """ - results = [] - for chat in chats: - df = get_participants_features(chat) - if anonymize: - df = anonymize_participants(df) - results.append(df) + + df = get_participants_features(chat) + if anonymize: + df= anonymize_participants(df) + + results = get_wide_to_long_participant(df) return results # ***** end of analysis functions ***** -def format_results(df_list): +def format_results(df_list, error): """Format results to the standard format. Parameters ---------- @@ -564,13 +641,16 @@ def format_results(df_list): """ results = [] for df in df_list: + user_name = pd.unique(df[COLNAMES_DF.USERNAME])[0] results.append( { - "id": "overview", - "title": "The following data is extracted from the file:", - "data_frame": df + "id": user_name, + "title": user_name, + "data_frame": df[[COLNAMES_DF.DESCRIPTION,COLNAMES_DF.VALUE]].reset_index(drop=True) } ) + if len(error)>0: + results = results+error return results @@ -583,45 +663,49 @@ def format_errors(errors): ------- pandas.dataframe """ + if len(errors) == 0: + return [] data_frame = pd.DataFrame() data_frame["Messages"] = pd.Series(errors, name="Messages") - return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame} + return [{"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame}] def process(file_data): - """Convert whatsapp chat_file.zip to participants dataframe. + """Convert whatsapp chat file to participant dataframes. This is the main function which extracts the participants - information from the row chat_file.zip provided by data-donators. + information from the row chat file provided by data-donators. Parameters ---------- file_data : str - The path of the chat_file.zip + The path of the chat file. It can be in zip or txt format. Returns ------- pandas.dataframe - Extracted data from the chat_file + Extracted data from the chat file """ errors = [] log_error = errors.append - zfile = None - chats = [] + try: zfile = zipfile.ZipFile(file_data) except: if FILE_RE.match(file_data.name): - zfile = open(file_data, encoding="utf8") - chat = parse_chat(log_error, zfile.read()) - chats.append(chat) + tfile = open(file_data, encoding="utf8") + chat = parse_chat(log_error, tfile.read()) + else: log_error("There is not a valid file format.") - return [format_errors(errors)] + return format_errors(errors) else: - chats = parse_zipfile(log_error, zfile) - if errors: - return [format_errors(errors)] + chat = parse_zipfile(log_error, zfile) - participants = extract_participants_features(chats) - formatted_results = format_results(participants) + if chat is not None: + chat = remove_system_messages(log_error,chat) + participants = extract_participants_features(chat) - return formatted_results + formatted_results = format_results(participants, format_errors(errors)) + else: + return format_errors(errors) + + return formatted_results