-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11 from sodascience/v2
V2
- Loading branch information
Showing
10 changed files
with
808 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[16/03/2022, 15:20:25] user1: Hi shiva! | ||
[16/03/2022, 15:25:38] user2: Hi 👋 | ||
[16/03/2022, 15:26:48] user3: Hoi! | ||
[16/03/2022, 18:39:29] user2: https://youtu.be/KBmUTY6mK_E | ||
[16/03/2022, 18:35:51] user1: Location: https://maps.google.com/?q=52.089451,5.108469 | ||
[20/03/2022, 20:08:51] user4: I’m about to generate some very random messages so that I can make some screenshots for the explanation to participants | ||
[24/03/2022, 20:19:38] user1: @user3 if you remove your Profile picture for a moment I will redo the screenshots 😁 | ||
[26/03/2022, 18:52:15] user2: Well done Utrecht 😁 | ||
Opkomst provinciesteden rond 20 procent, Utrecht hoogste opkomst van G4 | ||
https://nos.nl/l/2421368#UPDATE-container-60145354 | ||
[14/07/2020, 22:05:54] user4: 👍Bedankt |
Binary file not shown.
Binary file not shown.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pandas as pd | ||
from pandas.testing import assert_frame_equal | ||
|
||
from whatsapp_account_info import process | ||
from pathlib import Path | ||
|
||
|
||
# Directory next to this test module that holds the fixture files.
DATA_PATH = Path(__file__).parent.joinpath("data")

# Single-row table the account-info extraction is expected to yield.
EXPECTED = [
    dict(number_of_groups=4, number_of_contacts=3),
]
|
||
|
||
def test_process():
    """Check the account-info extraction against the expected table.

    Runs ``process`` on the ``account_info.zip`` fixture and compares the
    data frame of the first returned entry with a frame built from
    ``EXPECTED``.

    Raises
    ------
    AssertionError
        When the extracted data frame differs from the expected one.
    """
    outcome = process(DATA_PATH / "account_info.zip")
    actual = outcome[0]["data_frame"]
    wanted = pd.DataFrame(EXPECTED)
    assert_frame_equal(actual, wanted)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from whatsapp_chat import process | ||
from whatsapp_chat import anonymize_participants | ||
from pathlib import Path | ||
import pandas as pd | ||
from pandas.testing import assert_frame_equal | ||
|
||
|
||
# Directory next to this test module that holds the fixture files.
DATA_PATH = Path(__file__).parent.joinpath("data")

# Column order of the expected per-participant table.
_COLUMNS = (
    'username', 'total_words_no', 'url_no', 'location_no', 'file_no',
    'message_no', 'out_degree', 'in_degree', 'user_reply2', 'reply_2_user',
)

# One tuple per participant, values aligned with _COLUMNS.
_ROWS = (
    ('user1', 20, 1, 1, 0, 3, 2, 3, 'user2', 'user2'),
    ('user2', 18, 2, 0, 0, 3, 3, 3, 'user1', 'user1'),
    ('user3', 1, 0, 0, 0, 1, 1, 1, 'user2', 'user2'),
    ('user4', 21, 0, 0, 0, 2, 2, 1, 'user1', 'user1'),
)

# Expected records, one dict per participant, keyed by column name.
EXPECTED = [dict(zip(_COLUMNS, row)) for row in _ROWS]
|
||
|
||
def test_process():
    """Check the chat extraction against the expected participant table.

    Builds the expected frame from ``EXPECTED``, passes it through
    ``anonymize_participants`` and casts the count columns to the dtypes
    the comparison requires, then compares it with the data frame that
    ``process`` returns for the ``_chat.txt`` fixture.

    Raises
    ------
    AssertionError
        When ``process`` does not return exactly one entry, or when its
        data frame differs from the expected one.
    """
    wanted = anonymize_participants(pd.DataFrame(EXPECTED))
    # assert_frame_equal also compares dtypes, so pin them per column.
    for column, dtype in (
        ('message_no', 'int64'),
        ('url_no', 'int32'),
        ('location_no', 'int32'),
        ('file_no', 'int32'),
    ):
        wanted[column] = wanted[column].astype(dtype)

    # Only the plain-text export is exercised here, not a zipped variant.
    outcome = process(DATA_PATH / "_chat.txt")
    assert len(outcome) == 1
    assert_frame_equal(outcome[0]["data_frame"], wanted)
|
||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
__version__ = '0.2.0' | ||
|
||
import zipfile | ||
import re | ||
import pandas as pd | ||
import json | ||
|
||
# Archive members under a macOS resource-fork folder; these are skipped.
# The original pattern ended in ``X*``, which made the final "X" optional.
HIDDEN_FILE_RE = re.compile(r".*__MACOSX")

# Archive members whose name ends in the literal extension ".json".
# The dot is escaped so that names such as "notjson" no longer match.
FILE_RE = re.compile(r".*\.json$")
|
||
|
||
class ColnamesDf:
    """Names of the keys read from the parsed account-information data."""

    # Groups column
    GROUPS = "groups"

    # Contacts column
    CONTACTS = "contacts"


# Shared instance through which the key names are accessed.
COLNAMES_DF = ColnamesDf()
|
||
|
||
def format_results(df):
    """Wrap the extracted data frame in the result structure.

    Parameters
    ----------
    df
        Data frame to report; stored unchanged under ``"data_frame"``.

    Returns
    -------
    list
        A one-element list containing a dict with the keys ``"id"``,
        ``"title"`` and ``"data_frame"``.
    """
    return [
        {
            "id": "Whatsapp account info",
            "title": "The account information file is read:",
            "data_frame": df,
        }
    ]
|
||
|
||
def format_errors(errors):
    """Build the extraction-log entry from the collected error messages.

    Parameters
    ----------
    errors
        Sequence of message strings gathered while processing.

    Returns
    -------
    dict
        Entry with ``"id"``, ``"title"`` and a ``"data_frame"`` whose single
        ``"Messages"`` column holds the given messages.
    """
    messages = pd.Series(errors, name="Messages")
    log_frame = pd.DataFrame()
    log_frame["Messages"] = messages
    return dict(id="extraction_log", title="Extraction log", data_frame=log_frame)
|
||
|
||
def extract_data(log_error, data):
    """Count the groups and contacts found in the parsed account data.

    Parameters
    ----------
    log_error
        Callable taking one message string; called when both counts are 0.
    data
        Parsed JSON content, indexed by the ``COLNAMES_DF`` key names.

    Returns
    -------
    tuple
        ``(groups_no, contacts_no)``; a count is 0 when its key is missing
        or the lookup is not possible on ``data``.
    """

    def count_entries(key, missing_note):
        # A missing key or non-indexable / unsized value counts as zero.
        try:
            return len(data[key])
        except (TypeError, KeyError):
            print(missing_note)
            return 0

    groups_no = count_entries(COLNAMES_DF.GROUPS, "No group is available")
    contacts_no = count_entries(COLNAMES_DF.CONTACTS, "No contact is available")

    if not (groups_no or contacts_no):
        log_error("Neither group nor contact is available")
    return groups_no, contacts_no
|
||
|
||
def parse_records(log_error, f):
    """Parse a JSON document from an open file object.

    Parameters
    ----------
    log_error
        Callable taking one message string; called when parsing fails.
    f
        Readable file object (text or binary) containing the JSON document.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the content could not be
        parsed (in which case a message has been passed to ``log_error``).
    """
    try:
        return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Binary input that is not valid text raises UnicodeDecodeError
        # before the JSON parser runs; report it like malformed JSON.
        # Not every file object carries a ``name``, so fall back to a
        # placeholder rather than failing while reporting the failure.
        log_error(f"Could not parse: {getattr(f, 'name', '<unknown>')}")
        return None
|
||
|
||
def parse_zipfile(log_error, zfile):
    """Parse the first JSON member of an opened zip archive.

    Members matching ``HIDDEN_FILE_RE`` or not matching ``FILE_RE`` are
    skipped; the first remaining member is handed to ``parse_records``.

    Parameters
    ----------
    log_error
        Callable taking one message string; called when no JSON member is
        found (parsing failures are reported by ``parse_records``).
    zfile
        Opened archive providing ``namelist()`` and ``open(name)``.

    Returns
    -------
    object or None
        Whatever ``parse_records`` returns for the first JSON member, or
        ``None`` when the archive contains no such member.
    """
    for name in zfile.namelist():
        if HIDDEN_FILE_RE.match(name) or not FILE_RE.match(name):
            continue
        # Close the member handle once it has been read.
        with zfile.open(name) as member:
            return parse_records(log_error, member)
    log_error("No Json file is available")
    return None
|
||
|
||
def process(file_data):
    """Extract the number of groups and contacts from an account-info zip.

    Parameters
    ----------
    file_data
        Path or file object accepted by ``zipfile.ZipFile``.

    Returns
    -------
    list
        A one-element list. On success it holds the entry built by
        ``format_results`` with the columns ``number_of_groups`` and
        ``number_of_contacts``; if any error was logged it holds the
        extraction log built by ``format_errors`` instead.
    """
    errors = []
    log_error = errors.append

    # Release the archive handle as soon as the JSON has been read.
    with zipfile.ZipFile(file_data) as zfile:
        data = parse_zipfile(log_error, zfile)

    # Run the extraction whenever parsing reported no error. This also
    # covers a JSON document that is literally ``null``: it is reported as
    # having neither groups nor contacts instead of leaving the counts
    # unassigned.
    if not errors:
        groups_no, contacts_no = extract_data(log_error, data)

    if errors:
        return [format_errors(errors)]

    counts = pd.DataFrame(
        data={'number_of_groups': [groups_no], 'number_of_contacts': [contacts_no]}
    )
    return format_results(counts)
|
Oops, something went wrong.