-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquantilerobust_MSREinsensitive_normalize.pl
executable file
·317 lines (286 loc) · 8.92 KB
/
quantilerobust_MSREinsensitive_normalize.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#takes 3 arguments
#1 - name of file with the data
#2 - name of file with the UNCUTSNP sets (probe_ids)
#3 - name of file with the CUTSNP sets (probe_ids)
#4 - name of file with the model data (order of probe_ids doesn't matter)
use strict;
use warnings;
use Getopt::Long;
my $help;
my $datafile='';
my $mnrfile='';
my $mprfile='';
my $modelfile='';
GetOptions(
'datafile|i=s' => \$datafile,
'mnrfile|u=s' => \$mnrfile,
'mprfile|c=s' => \$mprfile,
'modelfile|m=s' => \$modelfile,
'help|h' => \&usage);
# usage() if ( @ARGV < 1 or $help eq 1);
if (defined $datafile && $datafile eq '') {
print("raw intensity datafile not defined\nUse --datafile/-i\n");
die;
}
if (defined $mnrfile && $mnrfile eq '') {
print("MNR probe id file not defined\nUse --mnrfile/-u\n");
die;
}
if (defined $mprfile && $mprfile eq '') {
print("MPR probe id file not defined\nUse --mprfile/-c/\n");
die;
}
if (defined $modelfile && $modelfile eq '') {
print("model sample raw intensity file not defined\nUse --modelfile/-m\n");
die;
}
##filenames
my $rawuncutdata = "rawUNCUT.txt";
my $rawcutdata = "rawCUT.txt";
my $quantuncutdata = "quantUNCUT.txt";
my $adjustdata = "quant-rawUNCUTadjustments.txt";
my $temp = "tempfile.txt";
#open model file and combine with datafile
#open model file, read into hash with probeid as key
open (MODEL, $modelfile);
my ($probeid, $data, $modelcolheader, $line);
my $modelheader =<MODEL>;
my %model;
while (<MODEL>) {
chomp;
($probeid, $data) = split /\t/;
$model{$probeid}=$data;
}
close MODEL;
#open data file, read into hash with probeid as key
open (ALLRAW, $datafile);
my $dataheader=<ALLRAW>;
chomp($dataheader);
my %allraw;
my @allrawline;
while (<ALLRAW>) {
chomp;
@allrawline = split /\t/, $_;
$probeid = $allrawline[0];
$line = join "\t", @allrawline;
$allraw{$probeid}=$line;
}
close ALLRAW;
#combine this data in a new file (temp)
open (TEMP, ">$temp");
print TEMP "$dataheader\tMODEL\n";
my $value;
foreach $value (keys %allraw) {
#print $model{$value};
print TEMP $allraw{$value}."\t".$model{$value}."\n";
}
close TEMP;
#extract UNCUT SNP data from temp datafile
open (TEMP, "<$temp");
open (UNCUTSNPS, $mnrfile);
open (RAWUNCUT, ">$rawuncutdata");
pullsnps (*TEMP, *UNCUTSNPS, *RAWUNCUT);
close UNCUTSNPS;
close RAWUNCUT;
close TEMP;
print "UNCUTSNP data extracted\n";
#extract CUT SNP data from datafile)
open (TEMP, "<$temp");
open (CUTSNPS, $mprfile);
open (RAWCUT, ">$rawcutdata");
pullsnps (*TEMP, *CUTSNPS, *RAWCUT);
close CUTSNPS;
close RAWCUT;
close TEMP;
print "CUTSNP data extracted\n";
#run R quantile normalization in R on rawUNCUTdata and ouput both the
# quantile normalized UNCUTdata results as well as the adjustments
open (QUANTUNCUT, ">$quantuncutdata");
open (ADJUSTS, ">$adjustdata");
system ("R --no-save < ./bin/Rquantilenorm.robust.R $rawuncutdata $quantuncutdata $adjustdata");
close QUANTUNCUT;
close ADJUSTS;
#open a data file to grab the header ($header) and count the number of cel files you are analysing ($numdatafields)
open (RAWCUT, "<$rawcutdata");
my $header=<RAWCUT>;
chomp $header;
my @headerfields=split (/\t/, $header);
my $numdatafields=@headerfields;
#print "number of cel files analysed: $numdatafields\n";
close RAWCUT;
#load the data files into a bunch of hashes
#this is done one data column at a time using a while loop and the $fieldcount and $numdatafields variables
#(with common keys for ADJ and UNCUT)
my $fieldcount=1;
while ($fieldcount<$numdatafields) {
#get the header for the fileoutput
my $fileheader=$headerfields[$fieldcount];
my $fileoutput=$fileheader."_quant.txt";
open (FILEOUTPUT, ">$fileoutput");
print FILEOUTPUT "probeID\tIntensity\n";
#load in the raw UNCUT data, after chopping off the header
#split the data and stuff into hash,
#key=first column=probeid ($uncutfields[0])
#value= intensity data from whichever column is referred to by $fieldcount ($uncutfields[$fieldcount])
#secondvalue=cutstatus of this probe
open (RAWUNCUT, "<$rawuncutdata");
<RAWUNCUT>;
my %all;
while (<RAWUNCUT> ) {
my @fields = split /\t/;
my $data=$fields[$fieldcount];
my $probeid=$fields[0];
$all{$probeid}[0]=$data;
$all{$probeid}[1]="UN";
}
close RAWUNCUT;
print "$fieldcount: RAWUNCUT loaded\n";
#load in the raw CUT data, after chopping off the header
#same hash loading as above
open (RAWCUT, "<$rawcutdata");
<RAWCUT>;
while (<RAWCUT> ) {
my @fields = split /\t/;
my $data=$fields[$fieldcount];
my $probeid=$fields[0];
$all{$probeid}[0]=$data;
$all{$probeid}[1]="CUT";
}
close RAWCUT;
print "$fieldcount: RAWCUT loaded\n";
#load in the adjustment values for each of the raw UNCUT data points, after chopping off the header
#same hash loading as above
open (ADJUSTS, "<$adjustdata");
<ADJUSTS>;
my %ADJHASH;
while (<ADJUSTS> ) {
my @fields = split /\t/;
my $data=$fields[$fieldcount];
my $probeid=$fields[0];
$ADJHASH{$probeid}=$data;
}
close ADJUSTS;
print "$fieldcount: ADJUSTS loaded\n";
#load in the data you have already calculated for the uncut probes (you will print this out later after you have the cut values)
open (QUANTUNCUT, "<$quantuncutdata");
<QUANTUNCUT>;
my %QUANT;
while (<QUANTUNCUT> ) {
chomp;
my @fields = split /\t/;
my $data=$fields[$fieldcount];
my $probeid=$fields[0];
$QUANT{$probeid}=$data;
}
close QUANTUNCUT;
print "$fieldcount: QUANTILE normalized UNCUT data loaded\n";
#sort the array of data (%all) by the intensities and put in same order onto two arrays, one with the SNP names (@snparray), and the other with the cut/uncut statuses (@cutarray)
my @snparray;
my @cutarray;
my $count=0;
my $probeid;
foreach $probeid (sort { $all{$a}[0] <=> $all{$b}[0] } keys %all) {
$all{$probeid}[2]=$count;
$count=$count+1;
push (@snparray, $probeid);
push (@cutarray, $all{$probeid}[1]);
}
print "$fieldcount: HASH sorted, and pushed onto arrays\n";
#this is the algorithm for finding the next highest/lowest uncut probe for each cut probe
my $totalprobes=@cutarray;
my $probeindex;
my $oversnp;
my $undersnp;
my $overtest;
my $i;
foreach $probeid (sort { $all{$a}[2] <=> $all{$b}[2] } keys %all) {
$probeindex=$all{$probeid}[2];
if ($cutarray[$probeindex]=~ /CUT/)
{
#start the counter at the current index in the sort
$i=$probeindex;
#temporarily set the $oversnp value to be the same as the current probe, in case there is not a higher uncut snp
$oversnp=$snparray[$i];
#step through the array until you get to the top, and exit loop when you find an uncut probe, dump that value into the $oversnp variable
while ($i<$totalprobes) {
if ($cutarray[$i]=~/UN/) {
$oversnp=$snparray[$i];
last;
}
$i=$i+1;
}
#reset the counter
$i=$probeindex;
#set the $undersnp to be the same as the $oversnp, in case there is not a lower uncut snp
$undersnp=$oversnp;
while (0<=$i) {
if ($cutarray[$i]=~/UN/) {
$undersnp=$snparray[$i];
last;
}
$i=$i-1;
}
#in case there was no $oversnp found and is still set the same as the current probe, set it to be the same as the $undersnp!
if ($oversnp=$snparray[$probeindex]) {
$oversnp=$undersnp;
}
my $overvalue=$all{$oversnp}[0];
my $undervalue=$all{$undersnp}[0];
my $currentvalue=$all{$probeid}[0];
my $over=abs($overvalue-$currentvalue);
my $under=abs($currentvalue-$undervalue);
my $adjcutid;
if ($over>=$under) {
$adjcutid=$undersnp;
} else {
$adjcutid=$oversnp;
}
# using the probe id of the UNCUT data point which has the closest value to the CUT data point
# grab the adjustment factor for that UNCUT probe and apply it to the raw CUT data point
my $adjcut = $ADJHASH{$adjcutid}*$currentvalue;
#round up result to one decimal place
$adjcut = sprintf("%.1f", $adjcut);
#push these values onto the results hash (with the uncut data)
$QUANT{$probeid}=$adjcut;
}
}
#sort the quantile normalized data by the snpid (uncut and cut) and print out
foreach $probeid (sort keys %QUANT) {
print FILEOUTPUT "$probeid\t$QUANT{$probeid}\n";
}
print $fileheader." printed to file\n";
print "$fieldcount: all adjusted values printed to file\n";
$fieldcount=$fieldcount+1;
close FILEOUTPUT;
}
exit;
#SUBSCRIPTS
sub pullsnps {
my $data = shift;
my $snps = shift;
my $outputfile = shift;
my $header=<$data>;
my $key;
print $outputfile $header;
my %DATAHASH;
while (<$data>) {
my @elements = split /\t/;
my $line = join "\t", @elements;
$DATAHASH{$elements[0]}=$line;
}
close $data;
<$snps>;
while (<$snps>) {
chomp $_;
$key = $_;
if (exists $DATAHASH{$key}) {
print $outputfile $DATAHASH{$key};
}
}
close $snps;
}
sub usage {
print "Unknown option: @_\n" if ( @_ );
print "usage: program [--datafile/-i absolute path to datafile] [--mnrfile/u absolute path to MNR probeid file] [--mprfile/c absolute path to MPR probeid file] [--modelfile/m absolute path to HapMap-derived model raw data file] [--help|-h]\n";
exit;
}