#
# normalize.R
# Step 4: Perform normalization
#
# # # # # # # # # # # # # # # # # # #
# 'order_counts.csv' is generated by 'compile_raw.R'
rawData <- read.csv('data/order_counts.csv', row.names=1, stringsAsFactors=F)
# 'minReads' is the rarefaction cutoff specified by 'summarize_raw.R'
minReads <- 144136
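# (Sketch, not part of the original pipeline) A cutoff like this is usually the
# read count of the smallest library that will be kept; one hypothetical way to
# derive it from the raw table, assuming a lower drop threshold 'dropBelow':
#   dropBelow <- 100000   # hypothetical threshold, not from 'summarize_raw.R'
#   minReads  <- min(rawData$reads[rawData$reads >= dropBelow])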
rawTreatments <- subset(rawData[,c(1:6)], reads >= minReads)
rawReads <- subset(rawData, reads >= minReads)[,-c(1:6)]
# Rarefaction (rrarefy() from the 'vegan' package) draws a random subsample of
# n = 'minReads' reads from each library
library(vegan)
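# (Assumption, not in the original script) 'rrarefy' and the library removal
# below both draw random samples, so results differ between runs; fixing a seed
# here would make the whole normalization reproducible, e.g.:
#   set.seed(20210401)   # hypothetical seed value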
normalizedData <- cbind(rawTreatments, rrarefy(rawReads, minReads))
normalizedData$reads <- minReads
# Check that actual rarefied library read counts all match 'minReads'
for (i in 1:nrow(normalizedData)) {
  expected <- minReads
  actual <- sum(normalizedData[i, 7:ncol(normalizedData)])
  if (expected != actual) {
    print(paste("ERROR: A row was incorrectly normalized:", expected, "!=", actual))
    # save the workspace and quit so the failed state can be inspected
    q(save="yes")
  }
}
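# (Equivalent vectorized form of the check above, shown as a sketch; it assumes
# the same layout, i.e. columns 7 onward hold the rarefied order counts)
#   stopifnot(all(rowSums(normalizedData[, -(1:6)]) == minReads))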
# Randomly remove libraries until there is an equal number for all treatment combinations
#>>>START
trtCombos <- unique(rawData[,c(3:5)])
trtNames <- apply(trtCombos, 1, paste, collapse=" ")
trtLength <- length(trtNames)
trtCounts <- matrix(0, nrow=trtLength, ncol=2)
rownames(trtCounts) <- trtNames
colnames(trtCounts) <- c("Total", "n to remove")
# find the total number of libraries in each treatment combination
for (i in 1:nrow(normalizedData)) {
  curTrt <- paste(normalizedData[i, c(3:5)], collapse=" ")
  trtCounts[curTrt, "Total"] <- trtCounts[curTrt, "Total"] + 1
}
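# (Sketch of an equivalent one-line tally; same assumption that columns 3-5 are
# the Plants/Water/Season treatment columns)
#   trtTable <- table(apply(normalizedData[, 3:5], 1, paste, collapse = " "))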
# find the minimum library count from all the treatment combinations
minCount <- min(trtCounts[,"Total"])
trtCounts[,"n to remove"] <- trtCounts[,"Total"] - minCount
librariesChosenForRemoval <- vector()
for (i in 1:trtLength) {
  curTrt <- trtCounts[trtNames[i],]
  if (curTrt["n to remove"] != 0) {
    potentialLibraries <- subset(normalizedData, Plants %in% trtCombos[i,] & Water %in% trtCombos[i,] & Season %in% trtCombos[i,])
    # select n libraries to remove at random from this treatment combination
    chosenLibrary <- sample(rownames(potentialLibraries), as.numeric(curTrt["n to remove"]))
    librariesChosenForRemoval <- append(librariesChosenForRemoval, chosenLibrary)
  }
}
indexesChosenForRemoval <- match(librariesChosenForRemoval, rownames(normalizedData))
# perform the removal (skipped when no libraries were chosen, since indexing a
# data frame with an empty negative index would drop every row)
if (length(indexesChosenForRemoval) > 0) {
  normalizedData <- normalizedData[-indexesChosenForRemoval, ]
}
#>>>FINISH
# Check that all treatment combinations have equal counts
trtCounts <- vector()
for (i in 1:trtLength) {
  potentialLibraries <- subset(normalizedData, Plants %in% trtCombos[i,] & Water %in% trtCombos[i,] & Season %in% trtCombos[i,])
  trtCounts <- append(trtCounts, nrow(potentialLibraries))
}
if (length(unique(trtCounts)) > 1) {
  print("ERROR: Not all treatment combinations have the same number of replicates")
  q(save="yes")
}
# For the sake of replication, keep track of which libraries were randomly chosen for removal
write.csv(librariesChosenForRemoval, file="data/libraries_chosen_for_removal.csv")
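# (Sketch, assuming 'write.csv' stored the vector under its default column 'x')
# To replay a previous run instead of sampling again, the saved list could be
# read back and the same rows dropped:
#   savedRemovals  <- read.csv("data/libraries_chosen_for_removal.csv", stringsAsFactors = FALSE)$x
#   normalizedData <- normalizedData[!(rownames(normalizedData) %in% savedRemovals), ]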
# Write normalized data to disk
write.csv(normalizedData, file="data/normalized_order_counts.csv")
q(save="no")