Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V2 #11

Merged
merged 17 commits into from
Jun 14, 2022
Merged
11 changes: 11 additions & 0 deletions data_extractor/tests/data/_chat.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[16/03/2022, 15:20:25] user1: Hi shiva!
[16/03/2022, 15:25:38] user2: Hi 👋
[16/03/2022, 15:26:48] user3: Hoi!
[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E
[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469
[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants
[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁
[26/03/2022, 18:52:15] user2: Well done Utrecht 😁
Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4
https://nos.nl/l/2421368#UPDATE-container-60145354
[14/07/2020, 22:05:54] user4: 👍Bedankt
Binary file added data_extractor/tests/data/account_info.zip
Binary file not shown.
Binary file added data_extractor/tests/data/whatsapp_chat.zip
Binary file not shown.
22 changes: 0 additions & 22 deletions data_extractor/tests/test_whatsapp.py

This file was deleted.

28 changes: 28 additions & 0 deletions data_extractor/tests/test_whatsapp_account_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from whatsapp_account_info import process
from pathlib import Path


# Directory that holds the fixture files used by this test module.
DATA_PATH = Path(__file__).parent.joinpath("data")

# Rows of the data frame that process() should produce for account_info.zip.
EXPECTED = [{'number_of_groups': 4, 'number_of_contacts': 3}]


def test_process():
    """Check process() against the account_info.zip fixture.

    The data frame carried by the first result entry must be equal to the
    frame built from EXPECTED.

    Raises
    ------
    AssertionError
        When the produced data frame differs from the expected one.
    """
    archive = DATA_PATH / "account_info.zip"
    df_result = process(archive)[0]["data_frame"]

    assert_frame_equal(df_result, pd.DataFrame(EXPECTED))

46 changes: 46 additions & 0 deletions data_extractor/tests/test_whatsapp_chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from whatsapp_chat import process
from whatsapp_chat import anonymize_participants
from pathlib import Path
import pandas as pd
from pandas.testing import assert_frame_equal


# Directory that holds the fixture files used by this test module.
DATA_PATH = Path(__file__).parent.joinpath("data")

# One row per participant of the _chat.txt fixture, before anonymization.
EXPECTED = [
    {
        'username': 'user1',
        'total_words_no': 20,
        'url_no': 1,
        'location_no': 1,
        'file_no': 0,
        'message_no': 3,
        'out_degree': 2,
        'in_degree': 3,
        'user_reply2': 'user2',
        'reply_2_user': 'user2',
    },
    {
        'username': 'user2',
        'total_words_no': 18,
        'url_no': 2,
        'location_no': 0,
        'file_no': 0,
        'message_no': 3,
        'out_degree': 3,
        'in_degree': 3,
        'user_reply2': 'user1',
        'reply_2_user': 'user1',
    },
    {
        'username': 'user3',
        'total_words_no': 1,
        'url_no': 0,
        'location_no': 0,
        'file_no': 0,
        'message_no': 1,
        'out_degree': 1,
        'in_degree': 1,
        'user_reply2': 'user2',
        'reply_2_user': 'user2',
    },
    {
        'username': 'user4',
        'total_words_no': 21,
        'url_no': 0,
        'location_no': 0,
        'file_no': 0,
        'message_no': 2,
        'out_degree': 2,
        'in_degree': 1,
        'user_reply2': 'user1',
        'reply_2_user': 'user1',
    },
]


def test_process():
    """Check process() against the _chat.txt fixture.

    The expected frame is built from EXPECTED, passed through
    anonymize_participants and cast to the integer dtypes listed below
    before it is compared with the single data frame returned by process().

    Raises
    ------
    AssertionError
        When process() does not return exactly one result, or when its data
        frame differs from the expected one.
    """
    df_expected = anonymize_participants(pd.DataFrame(EXPECTED))

    # Cast the count columns to the dtypes the comparison is made against.
    column_dtypes = (
        ('message_no', 'int64'),
        ('url_no', 'int32'),
        ('location_no', 'int32'),
        ('file_no', 'int32'),
    )
    for column, dtype in column_dtypes:
        df_expected[column] = df_expected[column].astype(dtype)

    # The plain-text export is processed here; the call against
    # "whatsapp_chat.zip" is currently disabled.
    result = process(DATA_PATH.joinpath("_chat.txt"))

    assert len(result) == 1
    assert_frame_equal(result[0]["data_frame"], df_expected)


20 changes: 0 additions & 20 deletions data_extractor/whatsapp/__init__.py

This file was deleted.

96 changes: 96 additions & 0 deletions data_extractor/whatsapp_account_info/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Version string of the whatsapp_account_info package.
__version__ = '0.2.0'

import zipfile
import re
import pandas as pd
import json

# Archive members whose path contains "__MACOSX" are skipped by parse_zipfile.
# The trailing "*" of the earlier pattern made the final "X" optional, which
# was never intended.
HIDDEN_FILE_RE = re.compile(r".*__MACOSX")

# Archive members whose name ends in a literal ".json". The dot is escaped so
# that names such as "account_infojson" are no longer accepted.
FILE_RE = re.compile(r".*\.json$")


class ColnamesDf:
    """Names of the entries read from the parsed account information."""

    # Groups column
    GROUPS = 'groups'

    # Contacts column
    CONTACTS = 'contacts'


# Module-wide instance through which the names above are accessed.
COLNAMES_DF = ColnamesDf()


def format_results(df):
    """Wrap *df* in the result structure returned by process().

    Parameters
    ----------
    df
        Data frame to expose under the "data_frame" key.

    Returns
    -------
    list
        A one-element list holding a dict with the keys "id", "title" and
        "data_frame".
    """
    return [
        {
            "id": "Whatsapp account info",
            "title": "The account information file is read:",
            "data_frame": df,
        }
    ]


def format_errors(errors):
    """Build the extraction-log result from the collected error messages.

    Parameters
    ----------
    errors
        Sequence of error message strings.

    Returns
    -------
    dict
        A dict with the keys "id", "title" and "data_frame"; the data frame
        has a single "Messages" column containing *errors*.
    """
    messages = pd.Series(errors, name="Messages")

    # The column is assigned onto an empty frame (rather than passed to the
    # constructor) so the resulting index is exactly what callers got before.
    log_frame = pd.DataFrame()
    log_frame["Messages"] = messages

    return {
        "id": "extraction_log",
        "title": "Extraction log",
        "data_frame": log_frame,
    }


def extract_data(log_error, data):
    """Count the groups and contacts found in the parsed account data.

    Parameters
    ----------
    log_error
        Callable invoked with a message when neither count is non-zero.
    data
        Parsed account information; indexed with the names defined on
        COLNAMES_DF.

    Returns
    -------
    tuple
        ``(groups_no, contacts_no)``. A count is 0 when its entry is missing
        or *data* cannot be indexed that way.
    """

    def count_entries(key, notice):
        # A missing key or an unsuitable container counts as "no entries";
        # the notice is printed rather than logged as an error.
        try:
            return len(data[key])
        except (TypeError, KeyError):
            print(notice)
            return 0

    groups_no = count_entries(COLNAMES_DF.GROUPS, "No group is available")
    contacts_no = count_entries(COLNAMES_DF.CONTACTS, "No contact is available")

    if not (groups_no or contacts_no):
        log_error("Neither group nor contact is available")
    return groups_no, contacts_no


def parse_records(log_error, f):
    """Parse the JSON document read from the file object *f*.

    Parameters
    ----------
    log_error
        Callable invoked with a message when the content cannot be parsed.
    f
        Readable file object containing the JSON document. Its ``name``
        attribute, when present, is used in the error message.

    Returns
    -------
    object or None
        The decoded JSON value, or None when parsing failed (in which case
        one message has been passed to *log_error*).
    """
    try:
        return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Binary streams are decoded by json.load itself, so content that is
        # not valid text surfaces as UnicodeDecodeError instead of
        # JSONDecodeError; both mean "this file is not usable JSON".
        source = getattr(f, "name", "<unnamed file>")
        log_error(f"Could not parse: {source}")
        return None


def parse_zipfile(log_error, zfile):
    """Parse the first JSON member found in an opened zip archive.

    Members matching HIDDEN_FILE_RE, and members not matching FILE_RE, are
    skipped. Only the first remaining member is parsed.

    Parameters
    ----------
    log_error
        Callable invoked with a message when no JSON member exists (it is
        also handed on to parse_records).
    zfile
        Opened archive offering ``namelist()`` and ``open(name)``.

    Returns
    -------
    object or None
        Whatever parse_records returns for the selected member, or None when
        the archive holds no suitable member.
    """
    for name in zfile.namelist():
        if HIDDEN_FILE_RE.match(name) or not FILE_RE.match(name):
            continue
        # Close the member as soon as it has been read instead of leaving the
        # handle open until garbage collection.
        with zfile.open(name) as member:
            return parse_records(log_error, member)

    log_error("No Json file is available")
    return None


def process(file_data):
    """Extract the number of groups and contacts from an account-info archive.

    Parameters
    ----------
    file_data
        Path or file object of the zip archive; passed to zipfile.ZipFile.

    Returns
    -------
    list
        A one-element list. On success it holds the dict built by
        format_results, whose data frame has the columns "number_of_groups"
        and "number_of_contacts". If any error was logged it holds the
        extraction log built by format_errors instead.
    """
    errors = []
    log_error = errors.append

    # The archive is only needed while parsing; close it right afterwards.
    with zipfile.ZipFile(file_data) as zfile:
        data = parse_zipfile(log_error, zfile)

    groups_no = contacts_no = 0
    if data is not None:
        groups_no, contacts_no = extract_data(log_error, data)
    elif not errors:
        # A JSON document consisting of `null` parses without an error but
        # carries no data; report it rather than failing on unbound counts.
        log_error("Neither group nor contact is available")

    if errors:
        return [format_errors(errors)]

    counts = {
        'number_of_groups': [groups_no],
        'number_of_contacts': [contacts_no],
    }
    return format_results(pd.DataFrame(data=counts))

Loading