import numpy as np
import pandas as pd
from scipy.signal import lfilter


def filter_sparse_samples(data, percentage, pseudo_zero):
    """Remove ASV/species abundance time series (columns) in which more than
    the given fraction of samples are zero (i.e. below pseudo_zero)."""
    num_of_samples = data.shape[0]
    zeroes_count = (data < pseudo_zero).sum(axis=0)
    to_remove = zeroes_count > num_of_samples * percentage
    data = data.drop(columns=to_remove[to_remove].index)
    return data
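
# A minimal sketch of the filtering rule on hypothetical data: with
# percentage=0.5 and pseudo_zero=0.01, column 'a' is below the pseudo-zero in
# two of three samples (> 50%) and is dropped, while 'b' is kept:
#
#   df = pd.DataFrame({'a': [0.0, 0.0, 1.0], 'b': [1.0, 2.0, 3.0]})
#   list(filter_sparse_samples(df, 0.5, 0.01).columns)  # -> ['b']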


def assign_func_clusters(func_tax, functions):
    """Assign labels/clusters according to functions.
    Even if an ASV/species is positive in more than one function, it will only
    be assigned one cluster."""
    # Find positive functions.
    func_start_column = func_tax.shape[1] - len(functions)
    positives = func_tax[:, func_start_column:] == 'POS'
    # Check if any ASV/species has more than one positive function.
    if np.any(np.sum(positives, axis=1) > 1):
        print('An ASV/species is positive in more than one function!')
    # Assign clusters.
    clusters_func = np.full(func_tax.shape[0], len(functions), dtype=int)
    for i in range(len(functions)):
        clusters_func[positives[:, i]] = i
    # if np.any(clusters_func == len(functions)):
    #     functions = functions.tolist()
    #     functions.append('None')
    print('Cluster labels: ', functions)
    print('Cluster sizes: ', np.unique(clusters_func, axis=0, return_counts=True)[1])
    print('Total taxa: ', func_tax.shape[0])
    return clusters_func
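
# Hedged example with hypothetical functional groups: func_tax rows end with
# one 'POS'/'NEG' column per function, so with functions=['AOB', 'NOB'] a
# purely negative taxon falls into the extra "no function" cluster (label 2):
#
#   func_tax = np.array([['ASV1', 'Nitrosomonas', 'POS', 'NEG'],
#                        ['ASV2', 'Nitrospira',   'NEG', 'POS'],
#                        ['ASV3', 'Tetrasphaera', 'NEG', 'NEG']])
#   assign_func_clusters(func_tax, ['AOB', 'NOB'])  # -> array([0, 1, 2])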


def load_data(config):
    """Load the data from the different files.

    Parameters:
    config : Dictionary of values from config.json."""
    # Default file paths.
    pp_dir = config['results_dir'] + '/data_reformatted/'
    pp_abund = pp_dir + 'abundances.csv'
    pp_tax_wfunctions = pp_dir + 'taxonomy_wfunctions.csv'
    # Read abundance data.
    abund = pd.read_csv(pp_abund, index_col=0)
    abund = abund.astype('float32', copy=False)
    # Read metadata and sort chronologically.
    meta = pd.read_csv(pp_dir + 'metadata.csv', index_col=0, parse_dates=['Date'])
    meta.sort_values(by='Date', inplace=True)
    # Also sort samples in abund chronologically according to metadata.
    abund = abund.reindex(index=meta.index, copy=False)
    # Read taxonomy_wfunctions; the function columns follow the 'Genus' column.
    func_tax = pd.read_csv(pp_tax_wfunctions, index_col=0, dtype=str)
    func_start_column = func_tax.columns.get_loc('Genus') + 1
    # func_in_file = func_tax.columns[func_start_column:].to_numpy()
    # Filter taxa whose time series contain too many zeros.
    abund = filter_sparse_samples(abund, config['max_zeros_pct'], config['pseudo_zero'])
    # Filter taxa with no positive value in any of the chosen functional groups.
    if config['only_pos_func']:
        positives = func_tax.iloc[:, func_start_column:] == 'POS'
        func_tax = func_tax.loc[positives.any(axis=1)]
    # Make abund and taxonomy contain the same taxa (intersect).
    func_tax = func_tax.filter(abund.columns, axis=0)
    abund = abund.filter(func_tax.index, axis=1)
    # Convert to numpy; abund becomes one row per taxon, one column per sample.
    func_tax.reset_index(inplace=True)
    func_tax = func_tax.to_numpy().astype(str)
    abund = abund.to_numpy().astype(float)
    abund = np.transpose(abund)
    clusters_func = assign_func_clusters(func_tax, functions=config['functions'])
    return abund, meta, func_tax, clusters_func, config['functions']
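
# The config dictionary is only assumed to carry the keys read above; a
# hypothetical minimal config.json could look like:
#
#   {"results_dir": "results", "max_zeros_pct": 0.60, "pseudo_zero": 0.01,
#    "only_pos_func": true, "functions": ["AOB", "NOB"]}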


def transform(data, transform="standardize"):
    """'normalize' (divide each row by its mean) or 'standardize' (subtract
    the row mean and divide by the row std) the data; both then subtract the
    row minimum and divide by the row maximum of the raw data to scale the
    values towards [0, 1]. Use 'divmean' to only divide by the mean without
    scaling. 'none' returns the data untouched."""
    transform = transform.lower()
    valid_transformations = ["divmean", "normalize", "standardize", "none"]
    result = data
    mean = data.mean(axis=1).reshape(-1, 1)
    std = data.std(axis=1).reshape(-1, 1)
    min = result.min(axis=1).reshape(-1, 1)
    max = result.max(axis=1).reshape(-1, 1)
    if transform not in valid_transformations:
        raise ValueError(f'Valid transformations are: {valid_transformations}')
    elif transform == 'normalize' or transform == 'divmean':
        result = data / mean
    elif transform == 'standardize':
        result = data - mean
        result = result / std
    if transform == 'normalize' or transform == 'standardize':
        result = result - min
        result = result / max
    return result, mean, std, min, max, transform
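
# A small sketch of the 'divmean' branch on a single hypothetical time series
# (one row per taxon, one column per time point):
#
#   x = np.array([[1.0, 2.0, 3.0]])
#   result, mean, std, min_, max_, t = transform(x, 'divmean')
#   result  # -> array([[0.5, 1.0, 1.5]]), since the row mean is 2.0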


def rev_transform(DF, mean, std, min, max, transform="standardize"):
    """Reverse transform predicted values when the model is trained on
    transformed values. The column order of DF must match the row order of
    mean, std, min and max (i.e. the taxon order used by transform())."""
    transform = transform.lower()
    valid_transformations = ["divmean", "normalize", "standardize", "none"]
    if transform not in valid_transformations:
        raise ValueError(f'Valid transformations are: {valid_transformations}')
    result = DF.copy()
    # Loop over each column in the input DataFrame, assuming the order is the
    # same between DF, mean, std, min and max.
    for col in result:
        idx = DF.columns.get_loc(col)
        if transform == 'normalize' or transform == 'standardize':
            result[col] = result[col] * max[idx].item()
            result[col] = result[col] + min[idx].item()
        if transform == 'normalize' or transform == 'divmean':
            result[col] = result[col] * mean[idx].item()
        elif transform == 'standardize':
            result[col] = result[col] * std[idx].item()
            result[col] = result[col] + mean[idx].item()
    return result
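
# Continuing the hypothetical 'divmean' sketch above: rev_transform expects
# one DataFrame column per taxon, in the same order as the rows of
# mean/std/min/max, and recovers the original values:
#
#   preds = pd.DataFrame(result.T)                    # one column per taxon
#   rev_transform(preds, mean, std, min_, max_, t)    # -> back to 1.0, 2.0, 3.0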


def smooth(data, factor=8):
    """Smooth with a causal moving average; factor is the number of data
    points in the averaging window."""
    b = [1.0 / factor] * factor
    a = 1
    return lfilter(b, a, data)
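
# smooth() averages each point with the previous factor-1 inputs (implicit
# zeros before the first sample). For example:
#
#   smooth(np.ones(4), factor=2)  # -> array([0.5, 1., 1., 1.])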


if __name__ == "__main__":
    import json
    with open('config.json', 'r') as config_file:
        config = json.load(config_file)
    load_data(config)