From 5014770e7bdffcdbb43a08928d7c42730f488be9 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Mon, 4 Jul 2022 00:53:37 +0200 Subject: [PATCH 01/13] remove system messages --- data_extractor/whatsapp_chat/__init__.py | 71 ++++++++---------------- 1 file changed, 24 insertions(+), 47 deletions(-) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index e3f6577..82023a5 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -10,9 +10,6 @@ import hashlib import zipfile from pathlib import Path -#from nltk.corpus import stopwords -#from sklearn.feature_extraction.text import TfidfVectorizer - URL_PATTERN = r'(https?://\S+)' LOCATION_PATTERN = r'(Location: https?://\S+)' @@ -20,10 +17,7 @@ FILE_RE = re.compile(r".*.txt$") HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") -SYSTEM_MESSAGES=[ - 'Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.', - 'Berichten en gesprekken worden end-to-end versleuteld. Niemand buiten deze chat kan ze lezen of beluisteren, zelfs WhatsApp niet.' -] +SYSTEM_MESSAGES = ['end-to-end','WhatsApp'] hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:', '[%d-%m-%y %H:%M:%S] %name:'] @@ -481,15 +475,13 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - stacked = df_participants[['username', 'user_reply2', 'reply_2_user']].stack() - df_participants[['username', 'user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], - index=stacked.index).unstack() - df_participants[['username', 'user_reply2', 'reply_2_user']] = 'person' + df_participants[['username', 'user_reply2', - 'reply_2_user']].astype(str) + stacked = df_participants[['username']].stack() #, 'user_reply2', 'reply_2_user' + df_participants[['username']] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack() #, 'user_reply2', 'reply_2_user' + df_participants[['username']] = 'person' + df_participants[['username']].astype(str) return df_participants -def get_df_per_participant(df, anonymize): +def get_df_per_participant(df): """Generate one dataframe for each participant . Parameter ---------- @@ -511,12 +503,11 @@ def get_df_per_participant(df, anonymize): COLNAMES_DF.LOCATION_NO], var_name=COLNAMES_DF.DESCRIPTION, value_name=COLNAMES_DF.VALUE) - usernames = set(df_melt[COLNAMES_DF.USERNAME]) + usernames = sorted(set(df_melt[COLNAMES_DF.USERNAME])) for u in usernames: df_user = df_melt[(df_melt[COLNAMES_DF.USERNAME] == u) & df_melt[COLNAMES_DF.VALUE] != 0] - # if anonymize: - # df_user = anonymize_participants(df_user) + results.append(df_user) return results @@ -560,23 +551,6 @@ def get_participants_features(df_chat): COLNAMES_DF.LastMessage: 'max' }).reset_index() - response_matrix = get_response_matrix(df_chat) - out_degree = response_matrix.sum(axis=1) - in_degree = response_matrix.T.sum(axis=1) - user_reply2 = response_matrix.idxmax(axis=1) - reply2_user = response_matrix.T.idxmax(axis=1) - - response_matrix[COLNAMES_DF.OUT_DEGREE] = out_degree - response_matrix[COLNAMES_DF.IN_DEGREE] = in_degree - response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 - response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user - response_matrix.index.name = COLNAMES_DF.USERNAME - response_matrix = response_matrix.loc[:, - [COLNAMES_DF.OUT_DEGREE, COLNAMES_DF.IN_DEGREE, COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] - response_matrix = response_matrix.reset_index() - - df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") - return df_participants def remove_system_messages(chat): @@ -590,11 +564,13 @@ def remove_system_messages(chat): pandas.DataFrame A filtered dataframe """ - print(chat.loc[0,COLNAMES_DF.MESSAGE]) - print(SYSTEM_MESSAGES[1]) - for m in SYSTEM_MESSAGES: - group_name = chat.loc[chat[COLNAMES_DF.MESSAGE]==m,COLNAMES_DF.USERNAME] - print(group_name) + + message0 = chat.loc[0, COLNAMES_DF.MESSAGE] + is_system_message = True if all(s in message0 for s in SYSTEM_MESSAGES) else False + if is_system_message: + group_name = chat.loc[0, COLNAMES_DF.USERNAME] + chat = chat.loc[chat[COLNAMES_DF.USERNAME] != group_name,] + return chat def extract_participants_features(chat, anonymize=True): @@ -612,7 +588,10 @@ def extract_participants_features(chat, anonymize=True): """ df = get_participants_features(chat) - results = get_df_per_participant(df, anonymize) + if anonymize: + df= anonymize_participants(df) + + results = get_df_per_participant(df) return results # ***** end of analysis functions ***** @@ -655,29 +634,28 @@ def format_errors(errors): def process(file_data): - """Convert whatsapp chat_file.zip to participants dataframe. + """Convert whatsapp chat file to participant dataframes. This is the main function which extracts the participants - information from the row chat_file.zip provided by data-donators. + information from the row chat file provided by data-donators. Parameters ---------- file_data : str - The path of the chat_file.zip + The path of the chat file. It can be in zip or txt format. Returns ------- pandas.dataframe - Extracted data from the chat_file + Extracted data from the chat file """ errors = [] log_error = errors.append - zfile = None - #chats = [] + try: zfile = zipfile.ZipFile(file_data) except: if FILE_RE.match(file_data.name): tfile = open(file_data, encoding="utf8") chat = parse_chat(log_error, tfile.read()) - #chats.append(chat) + else: log_error("There is not a valid file format.") return [format_errors(errors)] @@ -686,7 +664,6 @@ def process(file_data): if errors: return [format_errors(errors)] - print(chat) chat = remove_system_messages(chat) participants = extract_participants_features(chat) formatted_results = format_results(participants) From 986fc0849e9a88139f5cac21b6418d8137120ed0 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Thu, 7 Jul 2022 12:53:10 +0200 Subject: [PATCH 02/13] add reply_2_user and user_reply_2 --- data_extractor/whatsapp_chat/__init__.py | 36 ++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index 82023a5..52727b5 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -36,16 +36,16 @@ class ColnamesDf: MESSAGE_LENGTH = 'message_length' """Message length column""" - FirstMessage = 'Date first message' #'first_message_date' + FirstMessage = 'Date first message' """Date of first message column""" - LastMessage = 'Date last message' #'last_message_date' + LastMessage = 'Date last message' """Date of last message column""" - MESSAGE_NO = 'Number of messages' #'message_no' + MESSAGE_NO = 'Number of messages' """Number of Message column""" - WORDS_NO = 'Total number of words' #'total_words_no' + WORDS_NO = 'Total number of words' """Total number of words column""" REPLY_2USER = 'reply_2_user' @@ -57,13 +57,13 @@ class ColnamesDf: USER_REPLY2 = 'user_reply2' """User replies to who the most column""" - URL_NO = 'Number of URLs'#,'url_no' + URL_NO = 'Number of URLs' """Number of URLs column""" - LOCATION_NO = 'Number of shared locations'#'location_no' + LOCATION_NO = 'Number of shared locations' """Number of locations column""" - FILE_NO = 'Number of shared files'#'file_no' + FILE_NO = 'Number of shared files' """Number of files column""" OUT_DEGREE = 'out_degree' @@ -475,9 +475,9 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - stacked = df_participants[['username']].stack() #, 'user_reply2', 'reply_2_user' - df_participants[['username']] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack() #, 'user_reply2', 'reply_2_user' - df_participants[['username']] = 'person' + df_participants[['username']].astype(str) + stacked = df_participants[['username','user_reply2', 'reply_2_user']].stack() + df_participants[['username','user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack() + df_participants[['username','user_reply2', 'reply_2_user']] = 'person' + df_participants[['username','user_reply2', 'reply_2_user']].astype(str) return df_participants @@ -500,7 +500,9 @@ def get_df_per_participant(df): df_melt = pd.melt(df, id_vars=[COLNAMES_DF.USERNAME], value_vars=[COLNAMES_DF.WORDS_NO, COLNAMES_DF.MESSAGE_NO, COLNAMES_DF.FirstMessage, COLNAMES_DF.LastMessage, COLNAMES_DF.URL_NO, COLNAMES_DF.FILE_NO, - COLNAMES_DF.LOCATION_NO], + COLNAMES_DF.LOCATION_NO, + COLNAMES_DF.REPLY_2USER, + COLNAMES_DF.USER_REPLY2], var_name=COLNAMES_DF.DESCRIPTION, value_name=COLNAMES_DF.VALUE) usernames = sorted(set(df_melt[COLNAMES_DF.USERNAME])) @@ -551,6 +553,18 @@ def get_participants_features(df_chat): COLNAMES_DF.LastMessage: 'max' }).reset_index() + response_matrix = get_response_matrix(df_chat) + user_reply2 = response_matrix.idxmax(axis=1) + reply2_user = response_matrix.T.idxmax(axis=1) + + response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 + response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user + response_matrix.index.name = COLNAMES_DF.USERNAME + response_matrix = response_matrix.loc[:,[COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] + response_matrix = response_matrix.reset_index() + + df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") + return df_participants def remove_system_messages(chat): From faedf246b5562d668948659ea76c1b689475587c Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 13:47:03 +0200 Subject: [PATCH 03/13] Add reply_2 --- data_extractor/whatsapp_chat/__init__.py | 86 ++++++++++-------------- 1 file changed, 35 insertions(+), 51 deletions(-) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index d679838..52727b5 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -10,9 +10,6 @@ import hashlib import zipfile from pathlib import Path -#from nltk.corpus import stopwords -#from sklearn.feature_extraction.text import TfidfVectorizer - URL_PATTERN = r'(https?://\S+)' LOCATION_PATTERN = r'(Location: https?://\S+)' @@ -20,10 +17,7 @@ FILE_RE = re.compile(r".*.txt$") HIDDEN_FILE_RE = re.compile(r".*__MACOSX*") -SYSTEM_MESSAGES=[ - 'Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.', - 'Berichten en gesprekken worden end-to-end versleuteld. Niemand buiten deze chat kan ze lezen of beluisteren, zelfs WhatsApp niet.' -] +SYSTEM_MESSAGES = ['end-to-end','WhatsApp'] hformats = ['%m/%d/%y, %H:%M - %name:', '[%d/%m/%y, %H:%M:%S] %name:', '%d-%m-%y %H:%M - %name:', '[%d-%m-%y %H:%M:%S] %name:'] @@ -42,16 +36,16 @@ class ColnamesDf: MESSAGE_LENGTH = 'message_length' """Message length column""" - FirstMessage = 'Date first message' #'first_message_date' + FirstMessage = 'Date first message' """Date of first message column""" - LastMessage = 'Date last message' #'last_message_date' + LastMessage = 'Date last message' """Date of last message column""" - MESSAGE_NO = 'Number of messages' #'message_no' + MESSAGE_NO = 'Number of messages' """Number of Message column""" - WORDS_NO = 'Total number of words' #'total_words_no' + WORDS_NO = 'Total number of words' """Total number of words column""" REPLY_2USER = 'reply_2_user' @@ -63,13 +57,13 @@ class ColnamesDf: USER_REPLY2 = 'user_reply2' """User replies to who the most column""" - URL_NO = 'Number of URLs'#,'url_no' + URL_NO = 'Number of URLs' """Number of URLs column""" - LOCATION_NO = 'Number of shared locations'#'location_no' + LOCATION_NO = 'Number of shared locations' """Number of locations column""" - FILE_NO = 'Number of shared files'#'file_no' + FILE_NO = 'Number of shared files' """Number of files column""" OUT_DEGREE = 'out_degree' @@ -481,21 +475,13 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - # stacked = df_participants[['username', 'user_reply2', 'reply_2_user']].stack() - # df_participants[['username', 'user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], - # index=stacked.index).unstack() - # df_participants[['username', 'user_reply2', 'reply_2_user']] = 'person' + df_participants[['username', 'user_reply2', - # 'reply_2_user']].astype(str) - # - - df_participants['username'] = pd.factorize(df_participants.username)[0] + 1 - df_participants['username'] = 'person' + df_participants['username'].astype(str) - - + stacked = df_participants[['username','user_reply2', 'reply_2_user']].stack() + df_participants[['username','user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack() + df_participants[['username','user_reply2', 'reply_2_user']] = 'person' + df_participants[['username','user_reply2', 'reply_2_user']].astype(str) return df_participants -def get_df_per_participant(df, anonymize): +def get_df_per_participant(df): """Generate one dataframe for each participant . Parameter ---------- @@ -514,16 +500,16 @@ def get_df_per_participant(df, anonymize): df_melt = pd.melt(df, id_vars=[COLNAMES_DF.USERNAME], value_vars=[COLNAMES_DF.WORDS_NO, COLNAMES_DF.MESSAGE_NO, COLNAMES_DF.FirstMessage, COLNAMES_DF.LastMessage, COLNAMES_DF.URL_NO, COLNAMES_DF.FILE_NO, - COLNAMES_DF.LOCATION_NO], + COLNAMES_DF.LOCATION_NO, + COLNAMES_DF.REPLY_2USER, + COLNAMES_DF.USER_REPLY2], var_name=COLNAMES_DF.DESCRIPTION, value_name=COLNAMES_DF.VALUE) - # usernames = set(df_melt[COLNAMES_DF.USERNAME]) - usernames = df_melt[COLNAMES_DF.USERNAME].unique() + usernames = sorted(set(df_melt[COLNAMES_DF.USERNAME])) for u in usernames: df_user = df_melt[(df_melt[COLNAMES_DF.USERNAME] == u) & df_melt[COLNAMES_DF.VALUE] != 0] - # if anonymize: - # df_user = anonymize_participants(df_user) + results.append(df_user) return results @@ -568,18 +554,13 @@ def get_participants_features(df_chat): }).reset_index() response_matrix = get_response_matrix(df_chat) - out_degree = response_matrix.sum(axis=1) - in_degree = response_matrix.T.sum(axis=1) user_reply2 = response_matrix.idxmax(axis=1) reply2_user = response_matrix.T.idxmax(axis=1) - response_matrix[COLNAMES_DF.OUT_DEGREE] = out_degree - response_matrix[COLNAMES_DF.IN_DEGREE] = in_degree response_matrix[COLNAMES_DF.USER_REPLY2] = user_reply2 response_matrix[COLNAMES_DF.REPLY_2USER] = reply2_user response_matrix.index.name = COLNAMES_DF.USERNAME - response_matrix = response_matrix.loc[:, - [COLNAMES_DF.OUT_DEGREE, COLNAMES_DF.IN_DEGREE, COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] + response_matrix = response_matrix.loc[:,[COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] response_matrix = response_matrix.reset_index() df_participants = pd.merge(df_participants, response_matrix, how="left", on=COLNAMES_DF.USERNAME, validate="1:1") @@ -597,11 +578,13 @@ def remove_system_messages(chat): pandas.DataFrame A filtered dataframe """ - # print(chat.loc[0,COLNAMES_DF.MESSAGE]) - # print(SYSTEM_MESSAGES[1]) - for m in SYSTEM_MESSAGES: - group_name = chat.loc[chat[COLNAMES_DF.MESSAGE]==m,COLNAMES_DF.USERNAME] - # print(group_name) + + message0 = chat.loc[0, COLNAMES_DF.MESSAGE] + is_system_message = True if all(s in message0 for s in SYSTEM_MESSAGES) else False + if is_system_message: + group_name = chat.loc[0, COLNAMES_DF.USERNAME] + chat = chat.loc[chat[COLNAMES_DF.USERNAME] != group_name,] + return chat def extract_participants_features(chat, anonymize=True): @@ -619,7 +602,10 @@ def extract_participants_features(chat, anonymize=True): """ df = get_participants_features(chat) - results = get_df_per_participant(df, anonymize) + if anonymize: + df= anonymize_participants(df) + + results = get_df_per_participant(df) return results # ***** end of analysis functions ***** @@ -662,29 +648,28 @@ def format_errors(errors): def process(file_data): - """Convert whatsapp chat_file.zip to participants dataframe. + """Convert whatsapp chat file to participant dataframes. This is the main function which extracts the participants - information from the row chat_file.zip provided by data-donators. + information from the row chat file provided by data-donators. Parameters ---------- file_data : str - The path of the chat_file.zip + The path of the chat file. It can be in zip or txt format. Returns ------- pandas.dataframe - Extracted data from the chat_file + Extracted data from the chat file """ errors = [] log_error = errors.append - zfile = None - #chats = [] + try: zfile = zipfile.ZipFile(file_data) except: if FILE_RE.match(file_data.name): tfile = open(file_data, encoding="utf8") chat = parse_chat(log_error, tfile.read()) - #chats.append(chat) + else: log_error("There is not a valid file format.") return [format_errors(errors)] @@ -693,7 +678,6 @@ def process(file_data): if errors: return [format_errors(errors)] - # print(chat) chat = remove_system_messages(chat) participants = extract_participants_features(chat) formatted_results = format_results(participants) From ffe7e53afa5a2921abeb7347fe1e501fd8f656a8 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 13:48:46 +0200 Subject: [PATCH 04/13] Add test for new output format, reply2, and remove group_name --- data_extractor/tests/test_whatsapp_chat.py | 39 ++++++---------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index b10b0b6..0938b68 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -1,35 +1,30 @@ from data_extractor.whatsapp_chat import process from data_extractor.whatsapp_chat import anonymize_participants -from data_extractor.whatsapp_chat import get_df_per_participant from pathlib import Path import pandas as pd from pandas.testing import assert_frame_equal - - DATA_PATH = Path(__file__).parent / "data" EXPECTED = [ {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), 'Date last message': pd.to_datetime('2022-03-24 20:19:38')}, + 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), 'Date last message': pd.to_datetime('2022-03-26 18:52:15')}, + 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), 'Date last message': pd.to_datetime('2022-03-26 18:52:15'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'}, {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, - 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), 'Date last message': pd.to_datetime('2022-03-16 15:26:48')}, + 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), 'Date last message': pd.to_datetime('2022-03-16 15:26:48'), + 'user_reply2': 'person2', 'reply_2_user': 'person2'}, {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, - 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), 'Date last message': pd.to_datetime('2022-03-20 20:08:51')} + 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), 'Date last message': pd.to_datetime('2022-03-20 20:08:51'), + 'user_reply2': 'person1', 'reply_2_user': 'person1'} ] -# EXPECTED_1 = {'Description': ['Total number of words', 'Number of messages', 'Date first message', 'Date last message', -# 'Number of URLs', 'Number of shared locations'], -# 'Value': [20, 3, pd.to_datetime('2022-03-16 15:20:25'), pd.to_datetime('2022-03-24 20:19:38'), 1, 1]} -# -# df_expected_1 = pd.DataFrame(data=EXPECTED_1) - def test_process(): """ Test process function. @@ -49,7 +44,8 @@ def test_process(): results = [] df_melt = pd.melt(df_expected, id_vars=["username"], value_vars=["Total number of words", "Number of messages", "Date first message", "Date last message", - "Number of URLs", "file_no", "Number of shared locations"], var_name='Description', value_name='Value') + "Number of URLs", "file_no", "Number of shared locations", "reply_2_user", "user_reply2"], + var_name='Description', value_name='Value') usernames = df_melt["username"].unique() for u in usernames: @@ -66,27 +62,14 @@ def test_process(): "data_frame": df[["Description", "Value"]].reset_index(drop=True) } ) - # print(type(expected_results[0]["data_frame"])) - # print(expected_results[1]["data_frame"]) - # print('******') - # result = process(DATA_PATH.joinpath("whatsapp_chat.zip")) df_result = process(DATA_PATH.joinpath("_chat.txt")) - # print(df_result[1]["data_frame"]) + assert_frame_equal(df_result[0]["data_frame"], expected_results[0]["data_frame"]) assert_frame_equal(df_result[1]["data_frame"], expected_results[1]["data_frame"]) assert_frame_equal(df_result[2]["data_frame"], expected_results[2]["data_frame"]) assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"]) - # print(type(df_result[0]["data_frame"])) - # print(df_result[0]["data_frame"]) - # print(df_expected_1) - # - # # assert len(result_file) == 1 - # - # - # assert_frame_equal(df_result[0]["data_frame"], df_expected_1) - if __name__ == "__main__": test_process() From 2ea702b5b14caef7bb0323cff2e90bcdbb651d4d Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 13:50:17 +0200 Subject: [PATCH 05/13] Add groupname --- data_extractor/tests/data/_chat.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt index c43d07a..062e1f0 100644 --- a/data_extractor/tests/data/_chat.txt +++ b/data_extractor/tests/data/_chat.txt @@ -1,3 +1,4 @@ +1/21/22, 19:56 - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more. [16/03/2022, 15:20:25] person1: Hi shiva! [16/03/2022, 15:25:38] person2: Hi 👋 [16/03/2022, 15:26:48] person3: Hoi! From 3f934fa95d7184dad8f9ba120998754b324838a2 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 13:51:54 +0200 Subject: [PATCH 06/13] Add groupname --- data_extractor/tests/data/whatsapp_chat.zip | Bin 617 -> 1150 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip index b3f6edfd76b4f70b94dd4b0ac39d66bf5fde1947..55bf9ed07df806479a7fb0374a0aca129d91fd48 100644 GIT binary patch literal 1150 zcmWIWW@Zs#-~hs~+~*+-P@urfz`)6%z!0CDkyxTvQc)5b!pp$E<=C;Ltw0Q-ODnh; z7+JnCGB5zOGccqEozA;rAaeJ+w%+fVHghaebQBi|357Cl%m`h`rD(p>c%zj423zLM z+Sx2Z))8Ot|6={ilDBBuNdxWMpLQNU{(O#2YUQaLJfB)0h@N?<+oljM;V?@~>gM+? zpE)vaRcQxSq*b~#Apw|C8R@Anm#KUOs9cZtMl4?SVY)DXX(C9D2e91ENE`iFRcH#^7XR$0;8 zz_;r#TM?eG9 z1PNw!0ck@6gI|o93iCHMHflVO5E5cZY%DP-VvrQg@S3sk(H|b3M<;*otmo*D~=e4ZA*jqjb*4 zJ7)|cKmT|$D@r{2Bv1ae>#l3o#~j_SZ?NLm-FU&X<-qhP^7Htu%sbJ~Htw-m89(LQ z_j~*GV@k2P o`=T3*Y$YgGVPHw45fji@GU7PEo0Sb@3KJ0S1k&q)DT#pr0LvM_U;qFB literal 617 zcmWIWW@Zs#U|`^2(3n0YWcuRU=PxocF#KU+VBiFb#wTYamgtpKl;lo2n0LiM#P$2s zU;O+V=c;WKZtqcL2?+@h?bNe!;!u2|malg0NZAJ4#&>O|Or5f8elgcvt#PzxR9~{v z?X6zd``zzf-22}6q;S$PlZ7R0ErqiRPT3tgwop|j&bwgin{4*uA`o5Li@$j_A<`-3ur612K{Ad@RKHpDnM(Ui2 zo#osA{yaA2koV&EJySAbw-}{eO$d<^wg3HR%0&&$XOhasViT8~_*7Z2?Bv|y4nfo?TNdL zq!J(MPTMNK{NA?g#dF%P&GLH{wCHohLbmJ@(I3D6r{1uiee6|$|Ahug_T3Y%uYU94 z!8YgIm?g7!bY{AF&Jynzf36=NGe>5sTeYIIpnIj0(X>r#>?aCbn=EI^nHV^)Mfl_~ zrONtc_I(?Ls^0!*4De=TvS+{*^D4l&2LS~ziIfhwphAoc5)4U7P8+U=++bYge#z Date: Thu, 7 Jul 2022 14:15:10 +0200 Subject: [PATCH 07/13] change format --- data_extractor/tests/data/_chat.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt index 062e1f0..845305d 100644 --- a/data_extractor/tests/data/_chat.txt +++ b/data_extractor/tests/data/_chat.txt @@ -1,4 +1,4 @@ -1/21/22, 19:56 - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more. +[16/03/2022, 15:10:17] Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more. [16/03/2022, 15:20:25] person1: Hi shiva! [16/03/2022, 15:25:38] person2: Hi 👋 [16/03/2022, 15:26:48] person3: Hoi! From f169f1693ea7e579d7ae4986667ec9042c3fbd81 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 14:15:36 +0200 Subject: [PATCH 08/13] change format --- data_extractor/tests/data/whatsapp_chat.zip | Bin 1150 -> 1149 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip index 55bf9ed07df806479a7fb0374a0aca129d91fd48..c40699d7fc035e55b114fc2959ff7b97aaad7b0d 100644 GIT binary patch delta 717 zcmeyz@t31Mz?+#xgno}l0E@3 zh%T+*W?*Fb#>l_`)Xu<=8szJL$3S53XYFO*RXgXVcTc`5}EAk zH8|!*oebZ7BIwCg;Ys-u_{;0--#%Kl=tuu)&h)F^o4lTEeektoP4`9CiOaHoeK*>9 z;LDGI`SWV#zIvShzI?WK;HHVsey_Hd&sAQ$>Df$AZ}Ca3Tf5~ZAA9xX?Dy-c{+m|m zd6{VnPYXS|<=*U%Z)VQgb7GZXsp5kd|6YYP?@@?suAj_p@*>rRBTRYCh4lx6SA;d! zowDG0cACR|cFLALH7xDNlxJPIDUsk88R*ntAFF@X?Z<(4OZry@`puH?h}^ie=c@AW zmzVRWGuHBD@ja2uZP=zQ0H%{!@M#IU5EHX%zO%7*KWLpPx5Ca1MMOG}> delta 718 zcmey%@sFcEz?+#xgnl_`)Xu<=8gx4Eih;=8@7j95XWGoMNYPPTAS4vZxG^JiA(x{0PUDSI z_8V-OH*06J2w6vbz5k2#FH7E{X(#Iqv~PdfdHneEIX0=4r*80kYJDJj=AmwzLb!y( zEHSB@-?x0`$hcLxr8p|%_K&sQH`G3yYcl01lWoq*jCFbw(y(lDA=@#x1H4i@79>}^ zywmf8r(%k31pi*HiF%8dN?zfd<7qBaHRWEzHo3`*Pw(;Ctdx3FV*>Xy&SUmX&hl$c zvexej6R_Gk(VuT(jL28zNxdh2-;Te}UpQ6c|3j`rkzeg2b-P*>^`<&V`*-%nncarU*y@eZ^b_K8tZ8({@4D# zeV_m3-M*W1I_LO1P1@xkmEYbq&%NJQT>e;_ZycwB9 sm=US4>fzQVX`oFFKrF(b085k%OB#(P8?eYASvEP0MUibi&_N6g0K@DjzW@LL From da3c616ab0befae90399d99e4d2633ad0335a497 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 14:19:30 +0200 Subject: [PATCH 09/13] Remove main() --- data_extractor/tests/test_whatsapp_chat.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index 0938b68..cc580af 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -1,5 +1,5 @@ -from data_extractor.whatsapp_chat import process -from data_extractor.whatsapp_chat import anonymize_participants +from whatsapp_chat import process +from whatsapp_chat import anonymize_participants from pathlib import Path import pandas as pd from pandas.testing import assert_frame_equal @@ -71,7 +71,5 @@ def test_process(): assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"]) -if __name__ == "__main__": - test_process() From 1c7549768ce9d18502dbe28129bf845b54f73c25 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 15:48:45 +0200 Subject: [PATCH 10/13] pylint check --- data_extractor/tests/test_whatsapp_chat.py | 58 ++++++++++++++-------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index cc580af..f6459ff 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -1,55 +1,75 @@ -from whatsapp_chat import process -from whatsapp_chat import anonymize_participants from pathlib import Path import pandas as pd + +from whatsapp_chat import process +from whatsapp_chat import anonymize_participants + from pandas.testing import assert_frame_equal DATA_PATH = Path(__file__).parent / "data" EXPECTED = [ - {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), + {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, + 'Number of shared locations': 1,'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), + 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), 'user_reply2': 'person2', 'reply_2_user': 'person2'}, - {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, - 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), 'Date last message': pd.to_datetime('2022-03-26 18:52:15'), + {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3, + 'Date first message': pd.to_datetime('2022-03-16 15:25:38'), + 'Date last message': pd.to_datetime('2022-03-26 18:52:15'), 'user_reply2': 'person1', 'reply_2_user': 'person1'}, - {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, - 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), 'Date last message': pd.to_datetime('2022-03-16 15:26:48'), + {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1, + 'Date first message': pd.to_datetime('2022-03-16 15:26:48'), + 'Date last message': pd.to_datetime('2022-03-16 15:26:48'), 'user_reply2': 'person2', 'reply_2_user': 'person2'}, - {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, - 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), 'Date last message': pd.to_datetime('2022-03-20 20:08:51'), + {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0, + 'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2, + 'Date first message': pd.to_datetime('2020-07-14 22:05:54'), + 'Date last message': pd.to_datetime('2022-03-20 20:08:51'), 'user_reply2': 'person1', 'reply_2_user': 'person1'} ] def test_process(): """ Test process function. - compares the expected dataframe with the output of the process function to check if all the columns are match. + compares the expected dataframe with the output of the process function + to check if all the columns are match. Raises ------- - AssertionError: When provided expected dataframe could not match the participants dataframe + AssertionError: When provided expected dataframe could not match the + participants dataframe """ df_expected = pd.DataFrame(EXPECTED) df_expected = anonymize_participants(df_expected) df_expected['Number of messages'] = df_expected['Number of messages'].astype('int64') df_expected['Number of URLs'] = df_expected['Number of URLs'].astype('int32') - df_expected['Number of shared locations'] = df_expected['Number of shared locations'].astype('int32') + df_expected['Number of shared locations'] = \ + df_expected['Number of shared locations'].astype('int32') df_expected['file_no'] = df_expected['file_no'].astype('int32') results = [] df_melt = pd.melt(df_expected, id_vars=["username"], - value_vars=["Total number of words", "Number of messages", "Date first message", "Date last message", - "Number of URLs", "file_no", "Number of shared locations", "reply_2_user", "user_reply2"], + value_vars=["Total number of words", + "Number of messages", + "Date first message", + "Date last message", + "Number of URLs", + "file_no", + "Number of shared locations", + "reply_2_user", + "user_reply2"], var_name='Description', value_name='Value') usernames = df_melt["username"].unique() - for u in usernames: - df_user = df_melt[(df_melt["username"] == u) & df_melt["Value"] != 0] + for user in usernames: + df_user = df_melt[(df_melt["username"] == user) & df_melt["Value"] != 0] results.append(df_user) expected_results = [] @@ -69,7 +89,3 @@ def test_process(): assert_frame_equal(df_result[1]["data_frame"], expected_results[1]["data_frame"]) assert_frame_equal(df_result[2]["data_frame"], expected_results[2]["data_frame"]) assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"]) - - - - From 3e0b9cebb40dea798a82778c2e78e7e221a8190c Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 7 Jul 2022 16:16:34 +0200 Subject: [PATCH 11/13] run poetry --- data_extractor/tests/test_whatsapp_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py index f6459ff..87f3768 100644 --- a/data_extractor/tests/test_whatsapp_chat.py +++ b/data_extractor/tests/test_whatsapp_chat.py @@ -11,7 +11,7 @@ EXPECTED = [ {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1, - 'Number of shared locations': 1,'file_no': 0, 'Number of messages': 3, + 'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3, 'Date first message': pd.to_datetime('2022-03-16 15:20:25'), 'Date last message': pd.to_datetime('2022-03-24 20:19:38'), 'user_reply2': 'person2', 'reply_2_user': 'person2'}, From 43b8c1299a9838d4f7046900f5f3aac967997e62 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Fri, 8 Jul 2022 09:37:33 +0200 Subject: [PATCH 12/13] rename reply_2user and user_reply2 fields --- data_extractor/whatsapp_chat/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index 52727b5..2331e8d 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -48,13 +48,13 @@ class ColnamesDf: WORDS_NO = 'Total number of words' """Total number of words column""" - REPLY_2USER = 'reply_2_user' + REPLY_2USER = 'Who replies to you the most often?' """Who replies to the user the most column""" MAX_REPLY_2 = 'max_reply_2' """User replies to who the most column""" - USER_REPLY2 = 'user_reply2' + USER_REPLY2 = 'Who do you most often reply to?' """User replies to who the most column""" URL_NO = 'Number of URLs' @@ -475,9 +475,11 @@ def anonymize_participants(df_participants): # df_participants[COLNAMES_DF.USER_REPLY2] = df_participants[COLNAMES_DF.USER_REPLY2].apply(lambda u: anonym_txt(u,salt)) # df_participants[['username', 'user_reply2']] = df_participants[['username', 'user_reply2']].stack().rank(method='dense').unstack() - stacked = df_participants[['username','user_reply2', 'reply_2_user']].stack() - df_participants[['username','user_reply2', 'reply_2_user']] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack() - df_participants[['username','user_reply2', 'reply_2_user']] = 'person' + df_participants[['username','user_reply2', 'reply_2_user']].astype(str) + stacked = df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].stack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + pd.Series(stacked.factorize()[0], index=stacked.index).unstack() + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]] = \ + 'person' + df_participants[[COLNAMES_DF.USERNAME,COLNAMES_DF.USER_REPLY2, COLNAMES_DF.REPLY_2USER]].astype(str) return df_participants From 0bdc58b11506b00e5648c0e872259f6c030e315f Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Fri, 8 Jul 2022 18:11:47 +0200 Subject: [PATCH 13/13] add group_name and system message logs --- data_extractor/whatsapp_chat/__init__.py | 76 ++++++++++++++++-------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/data_extractor/whatsapp_chat/__init__.py b/data_extractor/whatsapp_chat/__init__.py index 2331e8d..f6b2045 100644 --- a/data_extractor/whatsapp_chat/__init__.py +++ b/data_extractor/whatsapp_chat/__init__.py @@ -7,9 +7,8 @@ import re from datetime import datetime import pandas as pd -import hashlib import zipfile -from pathlib import Path + URL_PATTERN = r'(https?://\S+)' LOCATION_PATTERN = r'(Location: https?://\S+)' @@ -51,9 +50,6 @@ class ColnamesDf: REPLY_2USER = 'Who replies to you the most often?' """Who replies to the user the most column""" - MAX_REPLY_2 = 'max_reply_2' - """User replies to who the most column""" - USER_REPLY2 = 'Who do you most often reply to?' """User replies to who the most column""" @@ -66,12 +62,6 @@ class ColnamesDf: FILE_NO = 'Number of shared files' """Number of files column""" - OUT_DEGREE = 'out_degree' - """Total number of sent message column""" - - IN_DEGREE = 'in_degree' - """Total number of received message column""" - EMOJI_NO = 'emoji_no' """Total number of emojies column""" @@ -209,9 +199,11 @@ def remove_alerts_from_df(r_x, df): pandas.DataFrame Fixed version of input DataFrame """ + + alerts_no = count_alerts(r_x, df) df_new = df.copy() df_new.loc[:, COLNAMES_DF.MESSAGE] = df_new[COLNAMES_DF.MESSAGE].apply(lambda x: remove_alerts_from_line(r_x, x)) - return df_new + return df_new,alerts_no def remove_alerts_from_line(r_x, line_df): @@ -228,11 +220,32 @@ def remove_alerts_from_line(r_x, line_df): Cleaned message string """ if re.search(r_x, line_df): + print(line_df[:re.search(r_x, line_df).start()]) return line_df[:re.search(r_x, line_df).start()] else: return line_df +def count_alerts(r_x, df): + """Count line content that is not desirable (automatic alerts etc.). + Parameters + ---------- + r_x : str + Regula expression to detect WhatsApp warnings + df : pandas.DataFrame + pandas.DataFrame with all interventions + + Returns + ------- + int + Number of line contents that is not desirable + """ + + # alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: (re.search(r_x, x) is not None)) + alerts_count = df[COLNAMES_DF.MESSAGE].apply(lambda x: re.findall(r_x, x)) + return alerts_count.str.len().sum() + + def get_message(text, headers, i): """Get i:th message from text. Parameters @@ -314,9 +327,12 @@ def make_chat_df(log_error, text, hformat): # Parse chat to DataFrame try: df = parse_text(text, r) - df = remove_alerts_from_df(r_x, df) + df, alerts_no = remove_alerts_from_df(r_x, df) df = add_schema(df) + if alerts_no>0: + log_error("Number of unprocessed system messages: "+str(alerts_no)) + return df except: print(f"hformat : {hformat} is not match with the given text") @@ -483,7 +499,7 @@ def anonymize_participants(df_participants): return df_participants -def get_df_per_participant(df): +def get_wide_to_long_participant(df): """Generate one dataframe for each participant . Parameter ---------- @@ -569,7 +585,7 @@ def get_participants_features(df_chat): return df_participants -def remove_system_messages(chat): +def remove_system_messages(log_error, chat): """Removes system messages from chat Parameters ---------- @@ -585,6 +601,7 @@ def remove_system_messages(chat): is_system_message = True if all(s in message0 for s in SYSTEM_MESSAGES) else False if is_system_message: group_name = chat.loc[0, COLNAMES_DF.USERNAME] + log_error("Identified group name:"+group_name) chat = chat.loc[chat[COLNAMES_DF.USERNAME] != group_name,] return chat @@ -607,13 +624,13 @@ def extract_participants_features(chat, anonymize=True): if anonymize: df= anonymize_participants(df) - results = get_df_per_participant(df) + results = get_wide_to_long_participant(df) return results # ***** end of analysis functions ***** -def format_results(df_list): +def format_results(df_list, error): """Format results to the standard format. Parameters ---------- @@ -627,11 +644,13 @@ def format_results(df_list): user_name = pd.unique(df[COLNAMES_DF.USERNAME])[0] results.append( { - "id": user_name,#"overview", - "title": user_name,#"The following data is extracted from the file:", + "id": user_name, + "title": user_name, "data_frame": df[[COLNAMES_DF.DESCRIPTION,COLNAMES_DF.VALUE]].reset_index(drop=True) } ) + if len(error)>0: + results = results+error return results @@ -644,9 +663,11 @@ def format_errors(errors): ------- pandas.dataframe """ + if len(errors) == 0: + return [] data_frame = pd.DataFrame() data_frame["Messages"] = pd.Series(errors, name="Messages") - return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame} + return [{"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame}] def process(file_data): @@ -674,14 +695,17 @@ def process(file_data): else: log_error("There is not a valid file format.") - return [format_errors(errors)] + return format_errors(errors) else: chat = parse_zipfile(log_error, zfile) - if errors: - return [format_errors(errors)] - chat = remove_system_messages(chat) - participants = extract_participants_features(chat) - formatted_results = format_results(participants) + if chat is not None: + chat = remove_system_messages(log_error,chat) + participants = extract_participants_features(chat) + + formatted_results = format_results(participants, format_errors(errors)) + + else: + return format_errors(errors) return formatted_results