Skip to content

Commit

Permalink
Merge pull request #11 from sodascience/v2
Browse files Browse the repository at this point in the history
V2
  • Loading branch information
mellelieuwes authored Jun 14, 2022
2 parents 268eef9 + ede47cc commit 3d15d55
Show file tree
Hide file tree
Showing 10 changed files with 808 additions and 42 deletions.
11 changes: 11 additions & 0 deletions data_extractor/tests/data/_chat.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[16/03/2022, 15:20:25] user1: Hi shiva!
[16/03/2022, 15:25:38] user2: Hi 👋
[16/03/2022, 15:26:48] user3: Hoi!
[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E
[16/03/2022, 18:35:51] user1: ‎Location: https://maps.google.com/?q=52.089451,5.108469
[20/03/2022, 20:08:51] user4‬: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants
[24/03/2022, 20:19:38] user1‬: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁
[26/03/2022, 18:52:15] user2: Well done Utrecht 😁
Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4
https://nos.nl/l/2421368#UPDATE-container-60145354
[14/07/2020, 22:05:54] user4: 👍Bedankt
Binary file added data_extractor/tests/data/account_info.zip
Binary file not shown.
Binary file added data_extractor/tests/data/whatsapp_chat.zip
Binary file not shown.
22 changes: 0 additions & 22 deletions data_extractor/tests/test_whatsapp.py

This file was deleted.

28 changes: 28 additions & 0 deletions data_extractor/tests/test_whatsapp_account_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from whatsapp_account_info import process
from pathlib import Path


# Directory holding the fixture files used by these tests.
DATA_PATH = Path(__file__).parent.joinpath("data")

# Rows of the frame that process() is expected to produce for the
# account_info.zip fixture.
EXPECTED = [
    {
        'number_of_groups': 4,
        'number_of_contacts': 3,
    },
]


def test_process():
    """ Test process function.
    Runs process on the account_info.zip fixture and compares the data frame
    of the first result entry with the frame built from EXPECTED.
    Raises
    -------
    AssertionError: When the produced data frame differs from the expected one
    """
    outputs = process(DATA_PATH / "account_info.zip")
    actual_frame = outputs[0]["data_frame"]

    assert_frame_equal(actual_frame, pd.DataFrame(EXPECTED))

46 changes: 46 additions & 0 deletions data_extractor/tests/test_whatsapp_chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from whatsapp_chat import process
from whatsapp_chat import anonymize_participants
from pathlib import Path
import pandas as pd
from pandas.testing import assert_frame_equal


# Directory holding the fixture files used by these tests.
DATA_PATH = Path(__file__).parent.joinpath("data")

# Column order of the expected frame; each row below follows this order.
_EXPECTED_COLUMNS = (
    'username', 'total_words_no', 'url_no', 'location_no', 'file_no',
    'message_no', 'out_degree', 'in_degree', 'user_reply2', 'reply_2_user',
)

# One row per username found in the _chat.txt fixture.
_EXPECTED_ROWS = (
    ('user1', 20, 1, 1, 0, 3, 2, 3, 'user2', 'user2'),
    ('user2', 18, 2, 0, 0, 3, 3, 3, 'user1', 'user1'),
    ('user3', 1, 0, 0, 0, 1, 1, 1, 'user2', 'user2'),
    ('user4', 21, 0, 0, 0, 2, 2, 1, 'user1', 'user1'),
)

EXPECTED = [dict(zip(_EXPECTED_COLUMNS, row)) for row in _EXPECTED_ROWS]


def test_process():
    """ Test process function.
    Runs process on the plain-text _chat.txt fixture and compares the single
    resulting data frame with the frame built from EXPECTED, after passing the
    expected frame through anonymize_participants and casting its count
    columns to the dtypes the comparison requires.
    Raises
    -------
    AssertionError: When process does not return exactly one result, or when
        its data frame differs from the expected one
    """
    expected_frame = anonymize_participants(pd.DataFrame(EXPECTED))

    # assert_frame_equal also compares dtypes, so the count columns are cast
    # explicitly: message_no to int64, the remaining counters to int32.
    expected_frame['message_no'] = expected_frame['message_no'].astype('int64')
    for column in ('url_no', 'location_no', 'file_no'):
        expected_frame[column] = expected_frame[column].astype('int32')

    outputs = process(DATA_PATH / "_chat.txt")
    assert len(outputs) == 1

    actual_frame = outputs[0]["data_frame"]
    assert_frame_equal(actual_frame, expected_frame)


20 changes: 0 additions & 20 deletions data_extractor/whatsapp/__init__.py

This file was deleted.

96 changes: 96 additions & 0 deletions data_extractor/whatsapp_account_info/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
__version__ = '0.2.0'

import zipfile
import re
import pandas as pd
import json

# Archive members whose path contains "__MACOSX" (the side directory that
# macOS archive tools add) are skipped. The previous pattern ended in "X*",
# which made the final "X" optional and therefore also matched "__MACOS".
HIDDEN_FILE_RE = re.compile(r".*__MACOSX")
# Only members whose name ends in a literal ".json" are parsed. The dot is
# escaped so that names such as "notjson" are no longer accepted.
FILE_RE = re.compile(r".*\.json$")


class ColnamesDf:
    """Names of the entries read from the parsed account-information data."""

    # Groups column
    GROUPS = 'groups'

    # Contacts column
    CONTACTS = 'contacts'


COLNAMES_DF = ColnamesDf()


def format_results(df):
    """Wrap the extracted data frame in the result structure.

    Parameters
    ----------
    df : object stored unchanged under the "data_frame" key

    Returns
    -------
    list
        A one-element list holding a dict with the keys "id", "title" and
        "data_frame".
    """
    entry = dict(
        id="Whatsapp account info",
        title="The account information file is read:",
        data_frame=df,
    )
    return [entry]


def format_errors(errors):
    """Build the extraction-log result from the collected error messages.

    Parameters
    ----------
    errors : sequence of messages, stored in the "Messages" column

    Returns
    -------
    dict
        A dict with the keys "id", "title" and "data_frame"; the data frame
        has the single column "Messages".
    """
    messages = pd.Series(errors, name="Messages")
    log_frame = pd.DataFrame()
    log_frame["Messages"] = messages
    return dict(id="extraction_log", title="Extraction log", data_frame=log_frame)


def extract_data(log_error, data):
    """Count the groups and contacts found in the parsed account data.

    Parameters
    ----------
    log_error : callable taking one message; called when both counts are 0
    data : object indexed with the group and contact keys; a missing key or
        a value that cannot be indexed or measured counts as 0

    Returns
    -------
    tuple
        (number of groups, number of contacts)
    """
    def count_entries(key, missing_message):
        # A missing key (KeyError) or an unsuitable value (TypeError) is
        # reported on stdout and treated as "no entries".
        try:
            return len(data[key])
        except (TypeError, KeyError):
            print(missing_message)
            return 0

    groups_no = count_entries(COLNAMES_DF.GROUPS, "No group is available")
    contacts_no = count_entries(COLNAMES_DF.CONTACTS, "No contact is available")

    if not (groups_no or contacts_no):
        log_error("Neither group nor contact is available")
    return groups_no, contacts_no


def parse_records(log_error, f):
    """Parse a JSON document from an open file object.

    Parameters
    ----------
    log_error : callable taking one message; called when parsing fails
    f : readable file object containing the JSON document

    Returns
    -------
    The decoded JSON value, or None when the content could not be parsed
    (in which case a message has been passed to log_error).
    """
    try:
        return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # json.load decodes byte streams itself, so content that is not
        # valid text raises UnicodeDecodeError rather than JSONDecodeError;
        # both mean the file could not be parsed.
        # Not every file-like object carries a name, so fall back to a
        # placeholder instead of failing while reporting the failure.
        log_error(f"Could not parse: {getattr(f, 'name', '<unknown>')}")
        return None


def parse_zipfile(log_error, zfile):
    """Parse the first JSON member of an opened zip archive.

    Members matching HIDDEN_FILE_RE, or not matching FILE_RE, are skipped.
    Only the first remaining member is parsed; later members are ignored
    even if that one fails to parse.

    Parameters
    ----------
    log_error : callable taking one message; called when no member qualifies
        (parse failures are reported by parse_records)
    zfile : open zipfile.ZipFile

    Returns
    -------
    The value returned by parse_records for the selected member, or None
    when the archive holds no qualifying member.
    """
    for name in zfile.namelist():
        if HIDDEN_FILE_RE.match(name) or not FILE_RE.match(name):
            continue
        # Close the member handle once it has been read instead of leaking it.
        with zfile.open(name) as member:
            return parse_records(log_error, member)
    log_error("No Json file is available")
    return None


def process(file_data):
    """Extract the number of groups and contacts from an account-info zip.

    Parameters
    ----------
    file_data : path or file object accepted by zipfile.ZipFile

    Returns
    -------
    list
        On success, the one-element list built by format_results around a
        data frame with the columns "number_of_groups" and
        "number_of_contacts". If any error was logged, a one-element list
        holding the extraction log built by format_errors.
    """
    errors = []
    log_error = errors.append

    # The archive is fully read inside the with-block, so it can be closed
    # before the data is analysed.
    with zipfile.ZipFile(file_data) as zfile:
        data = parse_zipfile(log_error, zfile)

    if errors:
        return [format_errors(errors)]

    # Always run the extraction, even when the JSON document decoded to
    # None (a literal "null"): extract_data reports unusable data through
    # log_error, whereas skipping it left the counts undefined.
    groups_no, contacts_no = extract_data(log_error, data)

    if errors:
        return [format_errors(errors)]

    counts = {'number_of_groups': [groups_no], 'number_of_contacts': [contacts_no]}
    return format_results(pd.DataFrame(data=counts))

Loading

0 comments on commit 3d15d55

Please sign in to comment.