summarize_raw.R
#
# Step 3: Determine optimum normalization cutoff
#
# # # # # # # # # # # # # # # # # # # # # # # # # # #
# 'order_counts.csv' is generated by 'compile_raw.R'
rawData <- read.csv('data/order_counts.csv', row.names=1, stringsAsFactors=F)
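# Quick sanity check (a sketch, not part of the original pipeline): the rest of this script
# assumes these columns exist in 'order_counts.csv'; the column set is inferred from how
# rawData is used below
stopifnot(all(c('Plants', 'Water', 'Season', 'reads') %in% colnames(rawData)))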
totalReads <- sum(rawData$reads)
totalLibraries <- dim(rawData)[1]
# We care about the number of libraries left in each treatment combination
# at each potential cutoff threshold
trtCombos <- unique(rawData[,c("Plants", "Water", "Season")])
trtCount <- dim(trtCombos)[1]
rownames(trtCombos) <- c(1:trtCount)
trtNames <- unique(paste(rawData$Plants, rawData$Water, rawData$Season))
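# Optional diagnostic (assumed useful for interactive runs, not required by later steps):
# how many libraries fall in each treatment combination before any cutoff is applied
print(table(paste(rawData$Plants, rawData$Water, rawData$Season)))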
# Potentially, we could set our cutoff to the number of reads in any library
orderedReadCounts <- sort(rawData$reads)
# Reads can be lost in one of three ways. We will keep track of the individual and total
# reads lost at each cutoff threshold
# Small library: once the cutoff surpasses a library's size, all of that library's reads are lost
# Rarefaction: libraries larger than the cutoff are reduced to a random sample of the original library, equal in size to the cutoff
# Treatment unification: we make sure we have the same number of libraries in each unique treatment
#   combination by randomly removing libraries from treatments that have more
allLosses <- matrix(ncol=totalLibraries, nrow=3)
colnames(allLosses) <- orderedReadCounts
rownames(allLosses) <- c('Small Library', 'Rarefaction', 'Treatment Unification')
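# Toy illustration (assumed numbers, not from the data): with library sizes c(100, 500, 800)
# and the loop below at a cutoff of 500, the 100- and 500-read libraries count as
# 'Small Library' losses (600 reads), the 800-read library is rarefied down to 500 reads
# ('Rarefaction' = 300), and each library later dropped to balance treatments costs a
# further 500 reads ('Treatment Unification')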
for (i in 1:totalLibraries) {
curCutoff <- orderedReadCounts[i]
remainingData <- subset(rawData[,c('Plants', 'Water', 'Season', 'reads')], reads > curCutoff)
# newRow tracks the number of libraries left in each treatment combination at this cutoff
newRow <- matrix(0, ncol=trtCount)
colnames(newRow) <- trtNames
rownames(newRow) <- curCutoff
for (j in 1:trtCount) {
trtCombo <- trtCombos[j,]
matchingLibraries <- subset(remainingData, Plants == trtCombo$Plants & Water == trtCombo$Water & Season == trtCombo$Season)
newRow[,trtNames[j]] <- dim(matchingLibraries)[1]
}
# targetSize: the number of libraries per treatment combination after the cutoff
# librariesLost: starts as just the small libraries, then adds the libraries that would be removed to unify treatments
targetSize <- min(newRow)
librariesLost <- i
for (j in 1:trtCount) {
librariesLost <- librariesLost + (newRow[,trtNames[j]] - targetSize)
}
# The rest of this loop is a little involved
# It calculates the number of reads lost to each of the three categories listed above
smallLibraryLosses <- sum(orderedReadCounts[c(1:i)])
allLosses['Small Library',as.character(curCutoff)] <- smallLibraryLosses
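# Rarefaction loss: each surviving library keeps exactly curCutoff reads, so the loss is
# whatever those libraries held above the cutoff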
librariesLeft <- totalLibraries - i
potentialReadsFromRemainingLibraries <- totalReads - smallLibraryLosses
actualReadsFromRemainingLibraries <- librariesLeft * curCutoff
allLosses['Rarefaction',as.character(curCutoff)] <- potentialReadsFromRemainingLibraries - actualReadsFromRemainingLibraries
librariesChosenForRemoval <- librariesLost - i
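# Each library removed for unification has already been rarefied down to curCutoff reads,
# so that is exactly what each removal costs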
allLosses['Treatment Unification',as.character(curCutoff)] <- librariesChosenForRemoval * curCutoff
}
# Visualize the numbers of reads lost for each cutoff
colors <- rainbow(3)
barplot(allLosses, main="Normalization Total Read Loss", xlab="Rarefaction Cutoff", ylab="Reads", col=colors)
legend("topleft", inset=c(.175, 0), legend=rownames(allLosses), col=colors, pch=c(19,19,19))
# Print out the optimum cutoff threshold
totalLosses <- colSums(allLosses)
totalLosses <- totalLosses[order(totalLosses)]
head(totalLosses, n=1)
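# Equivalent sketch using which.min(); 'optimumCutoff' is a name introduced here only for
# illustration and is not used elsewhere in the pipeline
optimumCutoff <- as.numeric(names(which.min(colSums(allLosses))))
cat('Optimum rarefaction cutoff:', optimumCutoff, '\n')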
q(save="no")