Merge pull request #14 from sodascience/v4

Whatsapp script-New format
eyra · Jul 11, 2022 · b8fa592 · b8fa592
2 parents 3d15d55 + 117673a
commit b8fa592
Show file tree

Hide file tree

Showing 4 changed files with 235 additions and 107 deletions.
diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt
@@ -1,11 +1,10 @@
-[16/03/2022, 15:20:25] user1: Hi shiva!
-[16/03/2022, 15:25:38] user2: Hi 👋
-[16/03/2022, 15:26:48] user3: Hoi!
-[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E
-[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469
-[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants
-[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁
-[26/03/2022, 18:52:15] user2: Well done Utrecht 😁
-Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4
-https://nos.nl/l/2421368#UPDATE-container-60145354
-[14/07/2020, 22:05:54] user4: 👍Bedankt
+[16/03/2022, 15:10:17] Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
+[16/03/2022, 15:20:25] person1: Hi shiva!
+[16/03/2022, 15:25:38] person2: Hi 👋
+[16/03/2022, 15:26:48] person3: Hoi!
+[16/03/2022, 18:39:29] person2: https://youtu.be/KBmUTY6mK_E
+[16/03/2022, 18:35:51] person1: ‎Location: https://maps.google.com/?q=52.089451,5.108469
+[20/03/2022, 20:08:51] person4: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants
+[24/03/2022, 20:19:38] person1: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁
+[26/03/2022, 18:52:15] person2: Well done Utrecht 😁
+[14/07/2020, 22:05:54] person4: 👍Bedankt
diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip
diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py
@@ -1,46 +1,91 @@
-from whatsapp_chat import process
-from whatsapp_chat import anonymize_participants
 from pathlib import Path
 import pandas as pd
+
+from whatsapp_chat import process
+from whatsapp_chat import anonymize_participants
+
 from pandas.testing import assert_frame_equal
 
 
 DATA_PATH = Path(__file__).parent / "data"
 
 EXPECTED = [
-    {'username': 'user1', 'total_words_no': 20, 'url_no': 1, 'location_no': 1, 'file_no': 0, 'message_no': 3,
-     'out_degree': 2, 'in_degree': 3, 'user_reply2': 'user2', 'reply_2_user': 'user2'},
+    {'username': 'person1', 'Total number of words': 20, 'Number of URLs': 1,
+     'Number of shared locations': 1, 'file_no': 0, 'Number of messages': 3,
+     'Date first message': pd.to_datetime('2022-03-16 15:20:25'),
+     'Date last message': pd.to_datetime('2022-03-24 20:19:38'),
+     'user_reply2': 'person2', 'reply_2_user': 'person2'},  # one dict per username; test_process loads these into a DataFrame
 
-    {'username': 'user2', 'total_words_no': 18, 'url_no': 2, 'location_no': 0, 'file_no': 0, 'message_no': 3,
-     'out_degree': 3, 'in_degree': 3, 'user_reply2': 'user1', 'reply_2_user': 'user1'},
+    {'username': 'person2', 'Total number of words': 7, 'Number of URLs': 1,
+     'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 3,
+     'Date first message': pd.to_datetime('2022-03-16 15:25:38'),
+     'Date last message': pd.to_datetime('2022-03-26 18:52:15'),
+     'user_reply2': 'person1', 'reply_2_user': 'person1'},
 
-    {'username': 'user3', 'total_words_no': 1, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 1,
-     'out_degree': 1, 'in_degree': 1, 'user_reply2': 'user2', 'reply_2_user': 'user2'},
+    {'username': 'person3', 'Total number of words': 1, 'Number of URLs': 0,
+     'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 1,
+     'Date first message': pd.to_datetime('2022-03-16 15:26:48'),
+     'Date last message': pd.to_datetime('2022-03-16 15:26:48'),
+     'user_reply2': 'person2', 'reply_2_user': 'person2'},
 
-    {'username': 'user4', 'total_words_no': 21, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 2,
-     'out_degree': 2, 'in_degree': 1, 'user_reply2': 'user1', 'reply_2_user': 'user1'}
+    {'username': 'person4', 'Total number of words': 21, 'Number of URLs': 0,
+     'Number of shared locations': 0, 'file_no': 0, 'Number of messages': 2,
+     'Date first message': pd.to_datetime('2020-07-14 22:05:54'),
+     'Date last message': pd.to_datetime('2022-03-20 20:08:51'),
+     'user_reply2': 'person1', 'reply_2_user': 'person1'}
 ]
 
 
 def test_process():
     """ Test process function.
-        compares the expected dataframe with the output of the process function to check if all the columns are match.
+        Compares the expected per-user dataframes with the output of the process
+         function to check that every Description/Value row matches.
         Raises
         -------
-        AssertionError: When provided expected dataframe could not match the participants dataframe
+        AssertionError: When an expected dataframe does not match the
+         corresponding participant dataframe
         """
 
     df_expected = pd.DataFrame(EXPECTED)
     df_expected = anonymize_participants(df_expected)
-    df_expected['message_no'] = df_expected['message_no'].astype('int64')
-    df_expected['url_no'] = df_expected['url_no'].astype('int32')
-    df_expected['location_no'] = df_expected['location_no'].astype('int32')
+    df_expected['Number of messages'] = df_expected['Number of messages'].astype('int64')
+    df_expected['Number of URLs'] = df_expected['Number of URLs'].astype('int32')
+    df_expected['Number of shared locations'] = \
+        df_expected['Number of shared locations'].astype('int32')
     df_expected['file_no'] = df_expected['file_no'].astype('int32')
 
-    # result = process(DATA_PATH.joinpath("whatsapp_chat.zip"))
-    result = process(DATA_PATH.joinpath("_chat.txt"))
-    assert len(result) == 1
-    df_result = result[0]["data_frame"]
-    assert_frame_equal(df_result, df_expected)
+    results = []
+    df_melt = pd.melt(df_expected, id_vars=["username"],
+                      value_vars=["Total number of words",
+                                  "Number of messages",
+                                  "Date first message",
+                                  "Date last message",
+                                  "Number of URLs",
+                                  "file_no",
+                                  "Number of shared locations",
+                                  "reply_2_user",
+                                  "user_reply2"],
+                      var_name='Description', value_name='Value')
+
+    usernames = df_melt["username"].unique()
+    for user in usernames:
+        df_user = df_melt[(df_melt["username"] == user) & (df_melt["Value"] != 0)]  # parenthesised: & binds tighter than !=
+        results.append(df_user)
+
+    expected_results = []
+    for df in results:
+        user_name = pd.unique(df["username"])[0]
+        expected_results.append(
+            {
+                "id": user_name,  # "overview",
+                "title": user_name,  # "The following data is extracted from the file:",
+                "data_frame": df[["Description", "Value"]].reset_index(drop=True)
+            }
+        )
 
+    df_result = process(DATA_PATH.joinpath("_chat.txt"))
 
+    assert_frame_equal(df_result[0]["data_frame"], expected_results[0]["data_frame"])
+    assert_frame_equal(df_result[1]["data_frame"], expected_results[1]["data_frame"])
+    assert_frame_equal(df_result[2]["data_frame"], expected_results[2]["data_frame"])
+    assert_frame_equal(df_result[3]["data_frame"], expected_results[3]["data_frame"])