-
Notifications
You must be signed in to change notification settings - Fork 4
/
groupSIFT_imputeMissingSubjects.m
116 lines (100 loc) · 4.89 KB
/
groupSIFT_imputeMissingSubjects.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
% groupSIFT_imputeMissingSubjects()--Takes two groupSIFT-saved individual
% subject data to perform missing data
% imputation using MDI Toolbox 4. The
% default algorithm used is Trimmed
% Scores Regression (TSR). The output
% variable may be saved by using
% Matlab function writetable().
%
% Usage
% outputTable = groupSIFT_imputeMissingSubjects(xlsx1, xlsx2)
%
% Input
% xlsx1: Fullpath for the *.xlsx file generated by groupSIFT that contains individual subject data for each edge.
% xlsx2: Fullpath for the *.xlsx file generated by groupSIFT that contains individual subject data for each edge.
%
% Output
% outputTable: Matlab table variable that contains corresponding pairs between xlsx1 and xlsx2 while missing variables imputed.
%
% History
% 06/05/2020 Makoto. Created.
% Copyright (C) 2020, Makoto Miyakoshi ([email protected]) , SCCN,INC,UCSD
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are met:
%
% 1. Redistributions of source code must retain the above copyright notice,
% this list of conditions and the following disclaimer.
%
% 2. Redistributions in binary form must reproduce the above copyright notice,
% this list of conditions and the following disclaimer in the documentation
% and/or other materials provided with the distribution.
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
% THE POSSIBILITY OF SUCH DAMAGE.
function outputTable = groupSIFT_imputeMissingSubjects(xlsx1, xlsx2)
% Pairs, edge by edge, the individual-subject values stored in two
% groupSIFT spreadsheets and fills in the value of every subject that is
% present in only one of them by calling pcambtsr() from MDI Toolbox 4.
%
% Input
%   xlsx1, xlsx2: paths of the two *.xlsx files. Both are read with
%                 xlsread(); column 1 is skipped and the remaining columns
%                 are consumed as (value, subject name) pairs, one pair per
%                 edge. Rows 2-6 are copied as the edge description and
%                 rows 7-end are read as the per-subject entries.
%
% Output
%   outputTable:  table built from a cell sheet holding, for each edge, a
%                 column of subject IDs followed by the imputed 'Data1' and
%                 'Data2' columns.
%
% Errors
%   Raised when pcambtsr() is not on the path, or when a subject ID occurs
%   more than once within one edge of either file.

% The imputation is delegated to pcambtsr(); stop early if it is missing.
if isempty(which('pcambtsr'))
    error('Set path to MDI Toolbox 4.')
end

% Load two datasets. Only the raw cell output of xlsread is used.
[~,~,RAW1] = xlsread(xlsx1);
[~,~,RAW2] = xlsread(xlsx2);

% Preliminary solution to export to excel: row labels for the first column.
dataSheet = cell(1,1);
dataSheet{1,1} = 'from';
dataSheet{2,1} = 'to';
dataSheet{3,1} = 'centroidTimeS';
dataSheet{4,1} = 'centroidFreqHz';
dataSheet{5,1} = 'clusterSize';

% Loop across edges. The edge count is taken from the first file.
for edgeIdx = 1:(size(RAW1,2)-1)/2

    % Find column indices for variables and subject names.
    currentEdgeVariableIdx = edgeIdx*2;
    currentEdgeSubjIdx     = currentEdgeVariableIdx+1;

    % Obtain the variables and subject names.
    currentEdgeVar1 = cell2mat(RAW1(7:end,currentEdgeVariableIdx));
    currentSubjVar1 = RAW1(7:end,currentEdgeSubjIdx);
    currentEdgeVar2 = cell2mat(RAW2(7:end,currentEdgeVariableIdx));
    currentSubjVar2 = RAW2(7:end,currentEdgeSubjIdx);

    % Remove NaNs (rows without a value), together with their subject cell.
    nanMask1 = isnan(currentEdgeVar1);
    currentEdgeVar1(nanMask1) = [];
    currentSubjVar1(nanMask1) = [];
    nanMask2 = isnan(currentEdgeVar2);
    currentEdgeVar2(nanMask2) = [];
    currentSubjVar2(nanMask2) = [];

    % Extract subject IDs: characters 4 to 9 of each subject name.
    subj1 = cellfun(@(x) x(4:9), currentSubjVar1, 'uniformoutput', false);
    subj2 = cellfun(@(x) x(4:9), currentSubjVar2, 'uniformoutput', false);

    % A subject listed twice cannot be mapped to a single row, so fail with
    % a clear message instead of silently overwriting one of the values.
    if numel(unique(subj1)) ~= numel(subj1) || numel(unique(subj2)) ~= numel(subj2)
        error('groupSIFT_imputeMissingSubjects:duplicateSubject', ...
              'Edge %d contains a duplicated subject ID.', edgeIdx)
    end

    % Generate data matrix with missing values (one row per subject found
    % in either file, one column per file).
    allUniqueSubj     = unique([subj1; subj2]);
    missingDataMatrix = nan(length(allUniqueSubj),2);

    % Map every entry to the row of its own subject. ismember() returns the
    % row for each element in the order of subj1/subj2, so the values stay
    % attached to the right subject even when the spreadsheet rows are not
    % alphabetically sorted (the sorted indices returned by intersect() are
    % only correct for already-sorted input).
    [~,subj1Idx] = ismember(subj1, allUniqueSubj);
    [~,subj2Idx] = ismember(subj2, allUniqueSubj);
    missingDataMatrix(subj1Idx,1) = currentEdgeVar1;
    missingDataMatrix(subj2Idx,2) = currentEdgeVar2;

    % Perform missing value imputation. The second argument is the number
    % of columns; 5000 and 1e-10 are presumably the iteration cap and the
    % convergence tolerance -- confirm against the MDI Toolbox help.
    imputedData = pcambtsr(missingDataMatrix, size(missingDataMatrix,2), 5000, 1e-10);

    % Build the new output: three sheet columns per edge (IDs, Data1, Data2).
    lastRow = 6+size(imputedData,1)-1;
    dataSheet(1:5,       edgeIdx*3-1) = RAW1(2:6,currentEdgeVariableIdx);
    dataSheet(6:lastRow, edgeIdx*3-1) = allUniqueSubj;

    dataSheet{4,         edgeIdx*3}   = 'Data1';
    dataSheet{5,         edgeIdx*3}   = ['Missing:', num2str(find(isnan(missingDataMatrix(:,1)')))];
    dataSheet(6:lastRow, edgeIdx*3)   = num2cell(imputedData(:,1));

    dataSheet{4,         edgeIdx*3+1} = 'Data2';
    dataSheet{5,         edgeIdx*3+1} = ['Missing:', num2str(find(isnan(missingDataMatrix(:,2)')))];
    dataSheet(6:lastRow, edgeIdx*3+1) = num2cell(imputedData(:,2));
end

% Convert cell to table.
outputTable = cell2table(dataSheet);