import ast
from collections import OrderedDict

import numpy as np
import pandas as pd

import properties
import utility
'''
Read the answersheets exported by the data-conversion step: a csv whose "answers"
column holds raw json objects.
'''
def read_answersheets(file_path, separator=","):
    """
    Read the actual answers of the subjects from the table answersheets.
    :param file_path: path to the answersheets csv
    :param separator: column separator of the csv
    :return: dict mapping each questionnaire_id to a list of per-subject answer rows
    """
    # Full schema of the answersheets table; pass names=column_names, header=None to
    # read_csv if the export has no header row.
    column_names = ['id', 'user_id', 'questionnaire_id', 'locale', 'answers', 'sensordata', 'client', 'flags',
                    'collected_at', 'deleted_at', 'created_at', 'updated_at']
    # read the answersheets csv and keep only the relevant columns
    answer_df = pd.read_csv(file_path, sep=separator)  # , names=column_names, header=None
    answer_df = answer_df[['user_id', 'questionnaire_id', 'answers', 'collected_at', 'deleted_at',
                           'created_at', 'updated_at']]
    questions_list = np.sort(answer_df.questionnaire_id.unique())
    questionnaires_dict = {}
    # For each questionnaire id build a list of row dictionaries, which are later written to separate files
    for idx in questions_list:
        li = []
        for row in answer_df.itertuples():
            if row.questionnaire_id == idx:
                # the answers are json objects, which are evaluated by ast.literal_eval
                a = manipulate_df(pd.DataFrame(ast.literal_eval(row.answers)),
                                  row.user_id, row.questionnaire_id, row.collected_at,
                                  row.created_at, row.updated_at)
                # append to the list until all the answersheet rows are processed
                li.append(a)
        questionnaires_dict[idx] = li
    return questionnaires_dict
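# Example usage (a minimal sketch; the path is an assumption, while the ";" separator
# matches the export read by build_data below):
#
#   sheets = read_answersheets("data/answersheets.csv", separator=";")
#   for qid, rows in sheets.items():
#       print(qid, len(rows))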
def manipulate_df(df, user_id, questionnaire_id, collected_at, created_at, updated_at, keep_answer_timestamps=False):
    """
    Reads the elements of the answers and creates keys and values from the answers in a format fit for
    appending to the final dataframe.
    :param df: dataframe built from one answersheet's json answers (columns label, value, collected_at)
    :param user_id: id of the subject
    :param questionnaire_id: id of the questionnaire
    :param collected_at: timestamp the answersheet was collected
    :param created_at: timestamp the answersheet was created
    :param updated_at: timestamp the answersheet was last updated
    :param keep_answer_timestamps: if True, also keep the per-answer collected_at timestamps
    :return: An OrderedDict that becomes one row of the final dataframe
    """
    a = OrderedDict()
    a['user_id'] = user_id
    a['questionnaire_id'] = questionnaire_id
    a['created_at'] = created_at
    if keep_answer_timestamps:
        a['collected_at'] = collected_at
        a['updated_at'] = updated_at
        for row in df.itertuples():
            time_label = row.label + '_collected_at'
            a[time_label] = row.collected_at
            a[row.label] = row.value
    else:
        for row in df.itertuples():
            a[row.label] = row.value
    return a
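# Example (a minimal sketch; the answer payload below is an illustrative assumption about
# the json structure, i.e. a list of {"label", "value", "collected_at"} objects):
#
#   raw = [{"label": "tschq01", "value": "yes", "collected_at": "2020-01-01 10:00:00"}]
#   row = manipulate_df(pd.DataFrame(raw), user_id=1, questionnaire_id=1,
#                       collected_at="2020-01-01 10:05:00", created_at="2020-01-01 10:05:00",
#                       updated_at="2020-01-01 10:05:00")
#   # row -> OrderedDict([('user_id', 1), ('questionnaire_id', 1),
#   #                     ('created_at', '2020-01-01 10:05:00'), ('tschq01', 'yes')])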
# Returns the per-questionnaire answers as a dictionary keyed by questionnaire_id
def build_data(file_path):
    """
    Build the questionnaire dictionary from the answersheets export.
    :param file_path: path to the ";"-separated answersheets csv
    :return: dict as returned by read_answersheets
    """
    print('Preparing data. Please wait...')
    return read_answersheets(file_path, ";")
# Write each questionnaire (one file per questionnaire_id) to csv or pickle
def write_from_dict(total_dict, csv=False):
    # Note that key is a questionnaire_id (1 - 6) and val is the data that is written out.
    for key, val in total_dict.items():
        temp_df = pd.DataFrame(val)
        if csv:
            temp_df.to_csv(properties.data_location + str(key) + "_q.csv")
        else:
            temp_df.to_pickle(properties.data_location + str(key) + "_q.pckl")
# Uncomment to reprocess the answersheets:
# file_path = properties.data_location + "answersheets.csv"
# total_questions_dict = build_data(file_path)
# write_from_dict(total_questions_dict, csv=True)
def preprocess_questions():
    """
    Read the question metadata and keep the English tschq and hq questions.
    :return: list of question dicts, one per question
    """
    question_with_translation = pd.read_csv("data/questions_with_translations.csv", sep=";")
    question_filtered = question_with_translation[['id', 'questiontype', 'label', 'values', 'required',
                                                   'locale', 'question', 'answers']]
    question_tschq_en = question_filtered[(question_filtered["locale"] == "en") &
                                          question_filtered["label"].str.match("tschq")]
    question_hq_en = question_filtered[(question_filtered["locale"] == "en") &
                                       question_filtered["label"].str.match("hq")]
    # DataFrame.append was removed in newer pandas versions; concatenate the two selections instead
    total_questions = pd.concat([question_tschq_en, question_hq_en])
    return total_questions.to_dict("records")
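# Example usage (a minimal sketch; it assumes data/questions_with_translations.csv is present):
#
#   questions = preprocess_questions()
#   print(len(questions), questions[0]["label"])   # the first record carries a "tschq..." label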
def remove_quotes(data):
    """
    Parse stringified lists and dicts (e.g. "['yes', 'no']") in the question dicts into real Python objects.
    :param data: list of question dicts as returned by preprocess_questions
    :return: list of question dicts with list/dict values evaluated
    """
    format_data = []
    for dicts in data:
        for key, val in dicts.items():
            if isinstance(val, str):
                # stringified lists or dicts are parsed with ast.literal_eval
                if (val.startswith("[") and val.endswith("]")) or (val.startswith("{") and val.endswith("}")):
                    dicts[key] = ast.literal_eval(val)
        format_data.append(dicts)
    return format_data
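# Example (a minimal sketch on a hand-made question dict, not the real export):
#
#   cleaned = remove_quotes([{"label": "tschq02", "values": "['yes', 'no']"}])
#   # cleaned -> [{'label': 'tschq02', 'values': ['yes', 'no']}]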
# remove_quotes(preprocess_questions())
# Uncomment to recreate the questions again:
# total_questions = {}
# total_questions["questions"] = remove_quotes(preprocess_questions())
# utility.save_data("total_questions", total_questions)
# utility.save_data("filter_data", remove_quotes(utility.load_data("total_questions")))