import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
######## BIN_SPIKES ########
def bin_spikes(spike_times,dt,wdw_start,wdw_end):
    """
    Function that puts spikes into bins

    Parameters
    ----------
    spike_times: an array of arrays
        an array of neurons. within each neuron's array is an array containing all the spike times of that neuron
    dt: number (any format)
        size of time bins
    wdw_start: number (any format)
        the start time for putting spikes in bins
    wdw_end: number (any format)
        the end time for putting spikes in bins

    Returns
    -------
    neural_data: a matrix of size "number of time bins" x "number of neurons"
        the number of spikes in each time bin for each neuron
    """
    edges=np.arange(wdw_start,wdw_end,dt) #Get edges of time bins
    num_bins=edges.shape[0]-1 #Number of bins
    num_neurons=spike_times.shape[0] #Number of neurons
    neural_data=np.empty([num_bins,num_neurons]) #Initialize array for binned neural data

    #Count number of spikes in each bin for each neuron, and put in array
    for i in range(num_neurons):
        neural_data[:,i]=np.histogram(spike_times[i],edges)[0]
    return neural_data
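
#Example usage (a minimal sketch with hypothetical spike times, not part of
#the original pipeline): two neurons binned into 0.5 s bins over [0, 2) s.
#    spike_times = np.array([np.array([0.1, 0.5, 1.2]), np.array([0.3, 0.9])], dtype=object)
#    neural_data = bin_spikes(spike_times, dt=0.5, wdw_start=0, wdw_end=2)
#    neural_data.shape #-> (3, 2): 3 time bins x 2 neurons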
######## BIN_OUTPUT ########
def bin_output(outputs,output_times,dt,wdw_start,wdw_end,downsample_factor=1):
    """
    Function that puts outputs into bins

    Parameters
    ----------
    outputs: matrix of size "number of times the output was recorded" x "number of features in the output"
        each entry in the matrix is the value of the output feature
    output_times: a vector of size "number of times the output was recorded"
        each entry has the time the output was recorded
    dt: number (any format)
        size of time bins
    wdw_start: number (any format)
        the start time for binning the outputs
    wdw_end: number (any format)
        the end time for binning the outputs
    downsample_factor: integer, optional, default=1
        how much to downsample the outputs prior to binning
        larger values will increase speed, but decrease precision

    Returns
    -------
    outputs_binned: matrix of size "number of time bins" x "number of features in the output"
        the average value of each output feature in every time bin
    """
    ###Downsample output###
    #We just take 1 out of every "downsample_factor" values#
    if downsample_factor!=1: #Don't downsample if downsample_factor=1
        downsample_idxs=np.arange(0,output_times.shape[0],downsample_factor) #Get the idxs of values we are going to include after downsampling
        outputs=outputs[downsample_idxs,:] #Get the downsampled outputs
        output_times=output_times[downsample_idxs] #Get the downsampled output times

    ###Put outputs into bins###
    edges=np.arange(wdw_start,wdw_end,dt) #Get edges of time bins
    num_bins=edges.shape[0]-1 #Number of bins
    output_dim=outputs.shape[1] #Number of output features
    outputs_binned=np.empty([num_bins,output_dim]) #Initialize matrix of binned outputs

    #Loop through bins, and get the mean outputs in those bins
    for i in range(num_bins): #Loop through bins
        idxs=np.where((np.squeeze(output_times)>=edges[i]) & (np.squeeze(output_times)<edges[i+1]))[0] #Indices of output samples that fall in this time bin
        for j in range(output_dim): #Loop through output features
            outputs_binned[i,j]=np.mean(outputs[idxs,j])
    return outputs_binned
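
#Example usage (hypothetical values, a minimal sketch): a 1-D output signal
#sampled at 100 Hz, averaged into the same 0.5 s bins as above.
#    output_times = np.arange(0, 2, 0.01) #200 samples at 100 Hz
#    outputs = np.sin(output_times)[:, None] #One output feature
#    outputs_binned = bin_output(outputs, output_times, dt=0.5, wdw_start=0, wdw_end=2)
#    outputs_binned.shape #-> (3, 1): mean output value in each of the 3 bins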
######## GET_SPIKES_WITH_HISTORY ########
def get_spikes_with_history(neural_data,bins_before,bins_after,bins_current=1):
    """
    Function that creates the covariate matrix of neural activity

    Parameters
    ----------
    neural_data: a matrix of size "number of time bins" x "number of neurons"
        the number of spikes in each time bin for each neuron
    bins_before: integer
        How many bins of neural data prior to the output are used for decoding
    bins_after: integer
        How many bins of neural data after the output are used for decoding
    bins_current: 0 or 1, optional, default=1
        Whether to use the concurrent time bin of neural data for decoding

    Returns
    -------
    X: a matrix of size "number of total time bins" x "number of surrounding time bins used for prediction" x "number of neurons"
        For every time bin, there are the firing rates of all neurons from the specified number of time bins before (and after)
    """
    num_examples=neural_data.shape[0] #Number of total time bins we have neural data for
    num_neurons=neural_data.shape[1] #Number of neurons
    surrounding_bins=bins_before+bins_after+bins_current #Number of surrounding time bins used for prediction
    X=np.full([num_examples,surrounding_bins,num_neurons],np.nan) #Initialize covariate matrix with NaNs

    #Loop through each time bin, and collect the spikes occurring in surrounding time bins
    #Note that the first "bins_before" and last "bins_after" rows of X will remain filled with NaNs, since they don't get filled in below.
    #This is because, for example, we cannot collect 10 time bins of spikes before time bin 8
    start_idx=0
    for i in range(num_examples-bins_before-bins_after): #The first bins_before and last bins_after bins don't get filled in
        end_idx=start_idx+surrounding_bins #The bins of neural data we will be including are between start_idx and end_idx (which will have length "surrounding_bins")
        X[i+bins_before,:,:]=neural_data[start_idx:end_idx,:] #Put neural data from surrounding bins in X, starting at row "bins_before"
        start_idx=start_idx+1
    return X
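
#Example usage (hypothetical shapes, a minimal sketch): with 3 bins before and
#3 bins after, each row of X holds 7 consecutive bins of activity.
#    neural_data = np.random.rand(1000, 50) #1000 time bins x 50 neurons
#    X = get_spikes_with_history(neural_data, bins_before=3, bins_after=3)
#    X.shape #-> (1000, 7, 50); the first 3 and last 3 rows stay NaN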
def preprocessing(trainX, testX, trainY, testY):
    """
    Function that pre-processes the data

    Parameters
    ----------
    trainX, testX: numpy array of shape [#datapoints, input_dim]
        contain feature vectors of train and test data
    trainY, testY: numpy array of shape [#datapoints, 1]
        contain labels of train and test data

    Returns
    -------
    x_scaler: StandardScaler
        scaler used to standardize the feature vectors
    y_scaler: MinMaxScaler
        scaler used to rescale the labels to [0, 1]
    scaled_trainX, scaled_testX: numpy array of shape [#datapoints, input_dim]
        scaled train and test feature vectors
    scaled_trainY, scaled_testY: numpy array of shape [#datapoints, 1]
        scaled train and test labels
    """
    x_scaler = StandardScaler()
    y_scaler = MinMaxScaler(feature_range=(0,1)) #StandardScaler() could be used instead
    #Fit the scalers on the training data only, then transform both train and test
    scaled_trainX = x_scaler.fit_transform(trainX)
    scaled_testX = x_scaler.transform(testX)
    scaled_trainY = y_scaler.fit_transform(trainY)
    scaled_testY = y_scaler.transform(testY)
    return x_scaler, y_scaler, scaled_trainX, scaled_testX, scaled_trainY, scaled_testY
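
#Example usage (hypothetical arrays, a minimal sketch). The scalers are fit on
#the training data only, so no test-set statistics leak into preprocessing.
#    trainX, testX = np.random.rand(800, 10), np.random.rand(200, 10)
#    trainY, testY = np.random.rand(800, 1), np.random.rand(200, 1)
#    x_scaler, y_scaler, s_trainX, s_testX, s_trainY, s_testY = preprocessing(trainX, testX, trainY, testY)
#    y_scaler.inverse_transform(s_testY) #Map scaled labels back to original units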
def remove_outliers(tX):
    """Replace outliers (values more than mean+3*std or less than mean-3*std) with the closest bound, feature by feature, in place"""
    for f in tX.T: #Each f is a view of one feature column, so clipping it modifies tX in place
        mean = np.mean(f)
        std = np.std(f)
        lowerb = mean - 3 * std #Lower bound
        upperb = mean + 3 * std #Upper bound
        np.clip(f, lowerb, upperb, out=f) #Clip values outside [lowerb, upperb] to the nearest bound
    return tX
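
#Example usage (hypothetical data, a minimal sketch): an extreme value is
#clipped in place to its feature's mean+3*std bound.
#    tX = np.random.randn(1000, 5)
#    tX[0, 0] = 100.0 #An obvious outlier
#    remove_outliers(tX)
#    tX[0, 0] #-> approximately mean + 3*std of column 0, no longer 100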