forked from afranks86/tissue-ptr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_protein.R
79 lines (64 loc) · 2.75 KB
/
preprocess_protein.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
rm(list=ls())
library(plyr)
source("util_functions.R")
## Build one log10 iBAQ matrix (rows = n2e() identifiers, columns = tissues)
## per proteomics dataset and write it to data/protein_<dataset>.csv.
protein_datasets <- c("Kim", "Wilhelm")

## Tissue labels are used both to build the input file names and as the output
## column names. The list does not depend on the dataset, so it is defined once.
## NOTE(review): "thryoid.gland" looks like a misspelling of "thyroid.gland".
## It is kept verbatim because it has to match the file names on disk --
## confirm against the data directory before changing it.
tissues <- c("adrenal.gland", "esophagus", "testis", "kidney",
             "ovary", "pancreas", "prostate", "liver", "uterus",
             "stomach", "thryoid.gland", "salivary.gland", "spleen",
             "colon", "heart", "lung")

for (nm in protein_datasets) {
  ## Seed for the successive outer merges: a single all-NA "Gene.names" column.
  ## The resulting NA-named row is expected to be discarded below together
  ## with every other row that n2e() cannot map.
  protein.cur <- matrix(ncol = 1)
  colnames(protein.cur) <- "Gene.names"
  n.found <- 0L

  for (tissue in tissues) {
    fname <- sprintf("data/%s_Search_Results_Data/%s_proteinGroups.txt",
                     nm, tissue)
    ## Tissues without a results file for this dataset are skipped.
    if (!file.exists(fname)) {
      next
    }
    n.found <- n.found + 1L

    tissue.tab <- read.table(fname, stringsAsFactors = FALSE, sep = "\t",
                             header = TRUE)
    tissue.tab <- tissue.tab[, c("Gene.names", "iBAQ")]
    colnames(tissue.tab)[2] <- tissue
    tissue.tab <- tissue.tab[tissue.tab$Gene.names != "", ]

    ## Outer join on gene name, then collapse rows sharing a gene name by
    ## summing each tissue column (missing values count as 0 in the sum).
    protein.cur <- merge(protein.cur, tissue.tab, by = "Gene.names",
                         all.x = TRUE, all.y = TRUE)
    protein.cur <- ddply(protein.cur, ~Gene.names, function(x)
      colSums(x[, -1, drop = FALSE], na.rm = TRUE))
  }

  ## Without any input file the seed matrix would reach strsplit() below and
  ## fail with an unhelpful "non character argument" error; fail clearly.
  if (n.found == 0L) {
    stop(sprintf("no proteinGroups files found in data/%s_Search_Results_Data",
                 nm),
         call. = FALSE)
  }

  ## Map every "Gene.names" entry (a ';'-separated list of names) through
  ## n2e(). An entry is kept only when it yields exactly one non-NA
  ## identifier; multi-name groups and unmapped names become NA.
  ens <- vapply(protein.cur[, "Gene.names"], function(x) {
    ids <- n2e(strsplit(x, ";")[[1]])
    if (length(ids) != 1 || is.na(ids)) {
      NA_character_
    } else {
      as.character(ids)
    }
  }, character(1))

  ## Keep the first row for each identifier and drop unmapped rows.
  not.na <- !duplicated(ens) & !is.na(ens)
  protein.cur <- protein.cur[not.na, ]
  rownames(protein.cur) <- ens[not.na]

  ## Numeric part only (drop "Gene.names"); drop = FALSE keeps the tissue
  ## column name and the row names even when a single tissue was found.
  protein.cur <- as.matrix(protein.cur[, -1, drop = FALSE])
  ## A zero is treated as "not measured" so that log10 gives NA, not -Inf.
  protein.cur[protein.cur == 0] <- NA
  protein.cur <- log10(protein.cur)

  ## Normalize proteins against particular tissue: shift every other column
  ## so that its median log-difference to the first column is zero.
  norm.tissue <- colnames(protein.cur)[1]
  for (tissue in setdiff(colnames(protein.cur), norm.tissue)) {
    protein.cur[, tissue] <- protein.cur[, tissue] +
      median(protein.cur[, norm.tissue] - protein.cur[, tissue], na.rm = TRUE)
  }

  write.csv(protein.cur, file = sprintf("data/protein_%s.csv", nm),
            quote = FALSE)
}
## Gold standard targeted MS for validation (from Edfors et al, 2016)
## Reads data/validation_raw.csv, re-keys its rows through n2e(), keeps the
## columns shared with the protein matrix, log10-transforms, normalizes against
## the first column and writes data/validation.csv.
protVal <- read.csv("data/validation_raw.csv", row.names = 1)
colnames(protVal) <- tolower(colnames(protVal))

## Rows whose name n2e() cannot map (NA) are dropped. drop = FALSE keeps the
## data frame shape even if the file has a single data column.
gnms <- n2e(rownames(protVal))
has.id <- !is.na(gnms)
protVal <- protVal[has.id, , drop = FALSE]
rownames(protVal) <- gnms[has.id]

## NOTE(review): the original code read colnames(protein), but no object named
## `protein` is assigned anywhere in this file. If one exists (e.g. created by
## util_functions.R) it is still used; otherwise fall back to protein.cur, the
## matrix left over from the last dataset processed above. Confirm which
## column set is intended.
ref.protein <- if (exists("protein")) protein else protein.cur
shared.tissues <- intersect(colnames(protVal), colnames(ref.protein))
protVal <- log10(protVal[, shared.tissues, drop = FALSE])
## log10(0) is -Inf; treat those entries as missing.
protVal[protVal == -Inf] <- NA

## Normalize proteins against first tissue: shift every other column so that
## its median log-difference to the first column is zero.
norm.tissue <- colnames(protVal)[1]
for (tissue in setdiff(colnames(protVal), norm.tissue)) {
  protVal[, tissue] <- protVal[, tissue] +
    median(protVal[, norm.tissue] - protVal[, tissue], na.rm = TRUE)
}

write.csv(protVal, file = "data/validation.csv", quote = FALSE)