summarize_raw.R
#
# Step 3: Determine optimum normalization cutoff
#
# # # # # # # # # # # # # # # # # # # # # # # # # # #
# 'order_counts.csv' is generated by 'compile_raw.R'
rawData <- read.csv('data/order_counts.csv', row.names=1, stringsAsFactors=F)
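# Quick sanity check (a sketch, not part of the original pipeline): the rest of this script
# assumes these columns exist in 'order_counts.csv'; the column set is inferred from how
# rawData is used below
stopifnot(all(c('Plants', 'Water', 'Season', 'reads') %in% colnames(rawData)))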
totalReads <- sum(rawData$reads)
totalLibraries <- dim(rawData)[1]
# We care about the number of libraries left in each treatment combination
# at each potential cutoff threshold
trtCombos <- unique(rawData[,c("Plants", "Water", "Season")])
trtCount <- dim(trtCombos)[1]
rownames(trtCombos) <- c(1:trtCount)
trtNames <- unique(paste(rawData$Plants, rawData$Water, rawData$Season))
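# Optional diagnostic (assumed useful for interactive runs, not required by later steps):
# how many libraries fall in each treatment combination before any cutoff is applied
print(table(paste(rawData$Plants, rawData$Water, rawData$Season)))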
# Potentially, we could set our cutoff to the number of reads in any library
orderedReadCounts <- sort(rawData$reads)
# Reads can be lost in one of three ways. We will keep track of the individual and total
# reads lost at each cutoff threshold
# Small library: once the cutoff surpasses a library's size, all of that library's reads are lost
# Rarefaction: libraries larger than the cutoff are reduced to a random sample of the original library, equal in size to the cutoff
# Treatment unification: we make sure we have the same number of libraries in each unique treatment
#   combination by randomly removing libraries from treatments that have more
allLosses <- matrix(ncol=totalLibraries, nrow=3)
colnames(allLosses) <- orderedReadCounts
rownames(allLosses) <- c('Small Library', 'Rarefaction', 'Treatment Unification')
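# Toy illustration (assumed numbers, not from the data): with library sizes c(100, 500, 800)
# and the loop below at a cutoff of 500, the 100- and 500-read libraries count as
# 'Small Library' losses (600 reads), the 800-read library is rarefied down to 500 reads
# ('Rarefaction' = 300), and each library later dropped to balance treatments costs a
# further 500 reads ('Treatment Unification')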
for (i in 1:totalLibraries) {
curCutoff <- orderedReadCounts[i]
remainingData <- subset(rawData[,c('Plants', 'Water', 'Season', 'reads')], reads > curCutoff)
# newRow tracks the number of libraries left in each treatment combination at this cutoff
newRow <- matrix(0, ncol=trtCount)
colnames(newRow) <- trtNames
rownames(newRow) <- curCutoff
for (j in 1:trtCount) {
trtCombo <- trtCombos[j,]
matchingLibraries <- subset(remainingData, Plants == trtCombo$Plants & Water == trtCombo$Water & Season == trtCombo$Season)
newRow[,trtNames[j]] <- dim(matchingLibraries)[1]
}
# targetSize: the number of libraries per treatment combination after the cutoff
# librariesLost: starts as just the small libraries, then adds the libraries that would be removed to unify treatments
targetSize <- min(newRow)
librariesLost <- i
for (j in 1:trtCount) {
librariesLost <- librariesLost + (newRow[,trtNames[j]] - targetSize)
}
# The rest of this loop is a little involved
# It calculates the number of reads lost to each of the three categories listed above
smallLibraryLosses <- sum(orderedReadCounts[c(1:i)])
allLosses['Small Library',as.character(curCutoff)] <- smallLibraryLosses
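# Rarefaction loss: each surviving library keeps exactly curCutoff reads, so the loss is
# whatever those libraries held above the cutoff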
librariesLeft <- totalLibraries - i
potentialReadsFromRemainingLibraries <- totalReads - smallLibraryLosses
actualReadsFromRemainingLibraries <- librariesLeft * curCutoff
allLosses['Rarefaction',as.character(curCutoff)] <- potentialReadsFromRemainingLibraries - actualReadsFromRemainingLibraries
librariesChosenForRemoval <- librariesLost - i
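# Each library removed for unification has already been rarefied down to curCutoff reads,
# so that is exactly what each removal costs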
allLosses['Treatment Unification',as.character(curCutoff)] <- librariesChosenForRemoval * curCutoff
}
# Visualize the numbers of reads lost for each cutoff
colors <- rainbow(3)
barplot(allLosses, main="Normalization Total Read Loss", xlab="Rarefaction Cutoff", ylab="Reads", col=colors)
legend("topleft", inset=c(.175, 0), legend=rownames(allLosses), col=colors, pch=c(19,19,19))
# Print out the optimum cutoff threshold
totalLosses <- colSums(allLosses)
totalLosses <- totalLosses[order(totalLosses)]
head(totalLosses, n=1)
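# Equivalent sketch using which.min(); 'optimumCutoff' is a name introduced here only for
# illustration and is not used elsewhere in the pipeline
optimumCutoff <- as.numeric(names(which.min(colSums(allLosses))))
cat('Optimum rarefaction cutoff:', optimumCutoff, '\n')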
q(save="no")