Merge pull request #11 from sodascience/v2

V2
eyra · Jun 14, 2022 · 3d15d55 · 3d15d55
2 parents 268eef9 + ede47cc
commit 3d15d55
Show file tree

Hide file tree

Showing 10 changed files with 808 additions and 42 deletions.
diff --git a/data_extractor/tests/data/_chat.txt b/data_extractor/tests/data/_chat.txt
@@ -0,0 +1,11 @@
+[16/03/2022, 15:20:25] user1: Hi shiva!
+[16/03/2022, 15:25:38] user2: Hi 👋
+[16/03/2022, 15:26:48] user3: Hoi!
+[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E
+[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469
+[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants
+[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁
+[26/03/2022, 18:52:15] user2: Well done Utrecht 😁
+Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4
+https://nos.nl/l/2421368#UPDATE-container-60145354
+[14/07/2020, 22:05:54] user4: 👍Bedankt
diff --git a/data_extractor/tests/data/account_info.zip b/data_extractor/tests/data/account_info.zip
diff --git a/data_extractor/tests/data/whatsapp_chat.zip b/data_extractor/tests/data/whatsapp_chat.zip
diff --git a/data_extractor/tests/test_whatsapp.py b/data_extractor/tests/test_whatsapp.py
diff --git a/data_extractor/tests/test_whatsapp_account_info.py b/data_extractor/tests/test_whatsapp_account_info.py
@@ -0,0 +1,28 @@
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+from whatsapp_account_info import process
+from pathlib import Path
+
+
+DATA_PATH = Path(__file__).parent / "data"
+EXPECTED = [
+    {'number_of_groups': 4,
+     'number_of_contacts': 3
+     }
+]
+
+
+def test_process():
+    """ Test process function.
+        compares the expected dataframe with the output of the process function to check if all the columns are matched.
+        Raises
+        -------
+        AssertionError: When provided expected dataframe could not match the participants dataframe
+        """
+    df_expected = pd.DataFrame(EXPECTED)
+
+    result = process(DATA_PATH.joinpath("account_info.zip"))
+    df_result = result[0]["data_frame"]
+    assert_frame_equal(df_result, df_expected)
+
diff --git a/data_extractor/tests/test_whatsapp_chat.py b/data_extractor/tests/test_whatsapp_chat.py
@@ -0,0 +1,46 @@
+from whatsapp_chat import process
+from whatsapp_chat import anonymize_participants
+from pathlib import Path
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+
+DATA_PATH = Path(__file__).parent / "data"
+
+EXPECTED = [
+    {'username': 'user1', 'total_words_no': 20, 'url_no': 1, 'location_no': 1, 'file_no': 0, 'message_no': 3,
+     'out_degree': 2, 'in_degree': 3, 'user_reply2': 'user2', 'reply_2_user': 'user2'},
+
+    {'username': 'user2', 'total_words_no': 18, 'url_no': 2, 'location_no': 0, 'file_no': 0, 'message_no': 3,
+     'out_degree': 3, 'in_degree': 3, 'user_reply2': 'user1', 'reply_2_user': 'user1'},
+
+    {'username': 'user3', 'total_words_no': 1, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 1,
+     'out_degree': 1, 'in_degree': 1, 'user_reply2': 'user2', 'reply_2_user': 'user2'},
+
+    {'username': 'user4', 'total_words_no': 21, 'url_no': 0, 'location_no': 0, 'file_no': 0, 'message_no': 2,
+     'out_degree': 2, 'in_degree': 1, 'user_reply2': 'user1', 'reply_2_user': 'user1'}
+]
+
+
+def test_process():
+    """ Test process function.
+        compares the expected dataframe with the output of the process function to check if all the columns are match.
+        Raises
+        -------
+        AssertionError: When provided expected dataframe could not match the participants dataframe
+        """
+
+    df_expected = pd.DataFrame(EXPECTED)
+    df_expected = anonymize_participants(df_expected)
+    df_expected['message_no'] = df_expected['message_no'].astype('int64')
+    df_expected['url_no'] = df_expected['url_no'].astype('int32')
+    df_expected['location_no'] = df_expected['location_no'].astype('int32')
+    df_expected['file_no'] = df_expected['file_no'].astype('int32')
+
+    # result = process(DATA_PATH.joinpath("whatsapp_chat.zip"))
+    result = process(DATA_PATH.joinpath("_chat.txt"))
+    assert len(result) == 1
+    df_result = result[0]["data_frame"]
+    assert_frame_equal(df_result, df_expected)
+
+
diff --git a/data_extractor/whatsapp/__init__.py b/data_extractor/whatsapp/__init__.py
diff --git a/data_extractor/whatsapp_account_info/__init__.py b/data_extractor/whatsapp_account_info/__init__.py
@@ -0,0 +1,96 @@
+__version__ = '0.2.0'
+
+import zipfile
+import re
+import pandas as pd
+import json
+
+HIDDEN_FILE_RE = re.compile(r".*__MACOSX*")
+FILE_RE = re.compile(r".*.json$")
+
+
+class ColnamesDf:
+    GROUPS = 'groups'
+    """Groups column"""
+
+    CONTACTS = 'contacts'
+    """Contacts column"""
+
+
+COLNAMES_DF = ColnamesDf()
+
+
+def format_results(df):
+    results = []
+    results.append(
+        {
+        "id": "Whatsapp account info",
+        "title": "The account information file is read:",
+        "data_frame": df
+        }
+    )
+    return results
+
+
+def format_errors(errors):
+    data_frame = pd.DataFrame()
+    data_frame["Messages"] = pd.Series(errors, name="Messages")
+    return {"id": "extraction_log", "title": "Extraction log", "data_frame": data_frame}
+
+
+def extract_data(log_error, data):
+    # data = pd.read_csv('whatsapp/df_chat.csv')
+    # return 1,1
+    groups_no = 0
+    contacts_no = 0
+    try:
+        groups_no = len(data[COLNAMES_DF.GROUPS])
+    except (TypeError, KeyError) as e:
+        print("No group is available")
+    try:
+        contacts_no = len(data[COLNAMES_DF.CONTACTS])
+    except (TypeError, KeyError) as e:
+        print("No contact is available")
+
+    if (groups_no == 0) and (contacts_no == 0):
+        log_error("Neither group nor contact is available")
+    return groups_no, contacts_no
+
+
+def parse_records(log_error, f):
+    try:
+        data = json.load(f)
+    except json.JSONDecodeError:
+        log_error(f"Could not parse: {f.name}")
+    else:
+        return data
+
+
+def parse_zipfile(log_error, zfile):
+    for name in zfile.namelist():
+        if HIDDEN_FILE_RE.match(name):
+            continue
+        if not FILE_RE.match(name):
+            continue
+        return parse_records(log_error, zfile.open(name))
+    log_error("No Json file is available")
+
+
+def process(file_data):
+    errors = []
+    log_error = errors.append
+    zfile = zipfile.ZipFile(file_data)
+    data = parse_zipfile(log_error, zfile)
+
+    if data is not None:
+        groups_no, contacts_no = extract_data(log_error, data)
+
+    if errors:
+        return [format_errors(errors)]
+
+    d = {'number_of_groups': [groups_no], 'number_of_contacts': [contacts_no]}
+    df = pd.DataFrame(data=d)
+    formatted_results = format_results(df)
+
+    return formatted_results
+