parse.py
import os

import numpy as np
import pandas as pd
# For quick testing: keep the header, then sample only ~1% of the data rows.
# A row is skipped when a uniform draw from [0, 1] exceeds p = 0.01.
# Note: this snippet also needs `import random`.
# p = 0.01
# df = pd.read_csv(
#     "grs.csv.gz",
#     index_col=0,
#     header=0,
#     skiprows=lambda i: i > 0 and random.random() > p,
# )


def load_data(data_folder):
    """Yield one record per (location, lineage, leaf) group from grs.csv.gz."""
    csv_path = os.path.join(data_folder, "grs.csv.gz")
    df = pd.read_csv(csv_path, index_col=0, header=0)
    df = df[
        [
            "date",
            "loc",
            "lin",
            "leaf",
            "N_7",
            "deltaN_7",
            "N_prev_7",
            "deltaN_prev_7",
            "Prevalence_7",
            "deltaPrevalence_7",
            "G_7",
            "deltaG_7",
        ]
    ]
df["Prevalence_7_percentage"] = df["Prevalence_7"] * 100
df["deltaPrevalence_7_percentage"] = df["deltaPrevalence_7"] * 100
df["G_7_linear"] = (np.exp(df["G_7"]) - 1) * 100
df["deltaG_7_linear"] = df["deltaG_7"] * np.exp(df["G_7"]) * 100
df["snr"] = abs(df["G_7_linear"] / df["deltaG_7_linear"])
df["invDeltaG_7"] = 1 / abs(df["deltaG_7"])
df["confidenceInterval95"] = df["deltaG_7"] * np.exp(df["G_7"]) * 1.96 * 100
df["confidenceInterval80"] = df["deltaG_7"] * np.exp(df["G_7"]) * 1.28 * 100
df["confidenceInterval65"] = df["deltaG_7"] * np.exp(df["G_7"]) * 0.93 * 100
df["confidenceInterval50"] = df["deltaG_7"] * np.exp(df["G_7"]) * 0.67 * 100
df["confidenceInterval35"] = df["deltaG_7"] * np.exp(df["G_7"]) * 0.45 * 100
df["confidenceInterval20"] = df["deltaG_7"] * np.exp(df["G_7"]) * 0.25 * 100
df["confidenceInterval5"] = df["deltaG_7"] * np.exp(df["G_7"]) * 0.06 * 100
    # parse the date strings into datetimes
    df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
    # keep only the last 90 days of data
    df = df[df["date"] >= (df["date"].max() - pd.to_timedelta("90day"))]
    # group by location, lineage, and leaf; collapse each column into a list
    df = df.groupby(["loc", "lin", "leaf"]).agg(lambda x: list(x))
    # convert dates back to strings in "YYYY-MM-DD" format
    df["date"] = df["date"].apply(lambda x: [i.strftime("%Y-%m-%d") for i in x])
    df = df.transpose()
    # after the transpose, each column is one (loc, lin, leaf) group
    for loc_lin_leaf, series in df.items():
        record = {}
        # expand the series of per-field lists into a dataframe so it can be
        # converted into one dict per time point
        values_df = pd.DataFrame([pd.Series(x) for x in series], index=df.index).fillna(0)
        values = list(values_df.to_dict().values())
        record["_id"] = f"{loc_lin_leaf[0]}_{loc_lin_leaf[1]}_{loc_lin_leaf[2]}"
        record["location"] = loc_lin_leaf[0]
        record["lineage"] = loc_lin_leaf[1]
        record["leaf"] = loc_lin_leaf[2]
        record["values"] = values
        yield record
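
# Illustrative shape of a yielded record (the field values shown here are
# hypothetical, not taken from the real data):
# {
#     "_id": "USA_BA.2_True",
#     "location": "USA",
#     "lineage": "BA.2",
#     "leaf": True,
#     "values": [{"date": "2022-01-01", "G_7_linear": 1.2, ...}, ...],
# }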


def custom_data_mapping(cls):
    # Elasticsearch field mapping for the records produced by load_data()
    return {
        "date": {"type": "date"},
        "location": {"type": "keyword"},
        "lineage": {"type": "keyword"},
        "leaf": {"type": "keyword"},
        "values": {
            "properties": {
                "confidenceInterval5": {"type": "double"},
                "confidenceInterval20": {"type": "double"},
                "confidenceInterval35": {"type": "double"},
                "confidenceInterval50": {"type": "double"},
                "confidenceInterval65": {"type": "double"},
                "confidenceInterval80": {"type": "double"},
                "confidenceInterval95": {"type": "double"},
                "date": {"type": "date"},
                "deltaG_7": {"type": "double"},
                "deltaG_7_linear": {"type": "double"},
                "deltaN_7": {"type": "double"},
                "deltaN_prev_7": {"type": "double"},
                "deltaPrevalence_7": {"type": "double"},
                "deltaPrevalence_7_percentage": {"type": "double"},
                "G_7": {"type": "double"},
                "G_7_linear": {"type": "double"},
                "invDeltaG_7": {"type": "double"},
                "N_7": {"type": "double"},
                "N_prev_7": {"type": "double"},
                "Prevalence_7": {"type": "double"},
                "Prevalence_7_percentage": {"type": "double"},
                "snr": {"type": "double"},
            }
        },
    }
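

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: it assumes grs.csv.gz sits in the
    # current working directory, which this file does not guarantee.
    for n, record in enumerate(load_data(".")):
        print(record["_id"], "->", len(record["values"]), "time points")
        if n >= 4:  # peek at the first few records only
            break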