Commit 6b7483b

more preparation for background synchronization (issue #17)
jweile committed May 25, 2018
1 parent b70d4c4 commit 6b7483b
Showing 3 changed files with 153 additions and 58 deletions.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -88,7 +88,7 @@ RUN a2dissite 000-default.conf && a2ensite mavevis
# ln -sf /proc/self/fd/1 /var/www/mavevis/logs/access.log

#setup startup script and daemon
COPY startup.sh daemon.R /setup/
COPY startup.sh daemon.R sync.R /setup/

#startup script starts apache and daemon
CMD bash /setup/startup.sh
22 changes: 22 additions & 0 deletions docker/daemon.R
@@ -61,6 +61,9 @@ daemon <- function() {
logger("INFO: Daemon started.")
#infinite loop
while(TRUE) {
#start DB synchronization if necessary
#TODO: enable once production DB is updated
# check.sync()
#patrol the directory for new jobs
patrol()
#sleep for two seconds until next patrol
@@ -71,6 +74,25 @@ daemon <- function() {
)
}

#time of last synchronization cycle
lastSyncTime <- as.Date("2018-01-01")

#check whether 1 day has passed since the last synchronization; if so, run it.
check.sync <- function() {
#calculate time passed since last synchronization
daysSinceSync <- difftime(Sys.time(), lastSyncTime, units = "days")
#if more than one day has passed
if (daysSinceSync > 1) {
#start the sync job
system(
"Rscript sync.R",
wait=FALSE
)
#update synchronization time
lastSyncTime <<- Sys.time()
}
}
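
#illustrative trace (hypothetical date; mirrors the logic above):
# difftime(Sys.time(), as.Date("2018-01-01"), units="days") > 1  #TRUE at first patrol
#so the first patrol after startup launches sync.R in the background;
#wait=FALSE keeps the job loop responsive, and resetting lastSyncTime
#defers the next cycle by roughly one day.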

#check the directory for jobs and deal with them
patrol <- function() {
#catch any errors and handle them
187 changes: 130 additions & 57 deletions docker/sync.R
@@ -6,10 +6,12 @@ options(stringsAsFactors=FALSE)
library(rapimave)
library(hgvsParseR)
library(yogitools)
library(mavevis)

baseURL <- "http://ec2-13-210-169-246.ap-southeast-2.compute.amazonaws.com/api/"

#Caching directory
# cache.dir <- paste0(tempdir(),"/")
cache.dir <- Sys.getenv("MAVEVIS_CACHE",unset="/var/www/mavevis/cache/")
if (!file.exists(cache.dir)) {
stop("Cache directory does not exist!")
Expand All @@ -25,7 +27,54 @@ logger <- function(msg) {
close(con)
}

logger("Starting new synchronization cyle.")
#translate nucleotide sequence to amino acid sequence
translate <- function(dna) {
#load translation table from mavevis package
data("trtable")
#define codon start positions
cstarts <- seq(1,nchar(dna),3)
#extract codons and translate
aa.seq <- paste(sapply(cstarts,function(cs) trtable[[substr(dna,cs,cs+2)]]),collapse="")
return(aa.seq)
}
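
#quick sanity check of translate() (illustrative; assumes trtable maps
#codon strings to one-letter amino-acid codes):
# translate("ATGGATCTT")  #expected "MDL" (ATG=M, GAT=D, CTT=L)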

#function to calculate the --ungapped-- offset from the uniprot sequence
calcOffset <- function(uniprot.acc, maveSeq) {

#if it's DNA, translate to protein first
if (grepl("^[ACGT]+$",maveSeq)) {
maveSeq <- translate(maveSeq)
}

#use getUniprotSeq function from mavevis package to download sequence from Uniprot
uniSeq <- getUniprotSeq(uniprot.acc)

error <- "ERROR: WT sequence does not match Uniprot entry"

#if the sequence in MaveDB is longer than the one in Uniprot, it can't be a match
if (nchar(uniSeq) < nchar(maveSeq)) {
logger(error)
return(-1)
#if they're identical, there is no offset
} else if (uniSeq == maveSeq) {
return(0)
} else {
#linearly traverse the longer sequence until it matches
i <- 1
imax <- nchar(uniSeq)-nchar(maveSeq)
while (maveSeq != substr(uniSeq,i,i+nchar(maveSeq)-1) && i <= imax+1) {
i <- i+1
}
if (i <= imax+1) {
#match found at position i, so the ungapped offset is i-1
return(i-1)
} else {
logger(error)
return(-1)
}
}
}
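
#worked example (hypothetical sequences): if the Uniprot sequence were
#"MGDLSAK" and maveSeq were "DLSAK", the scan would match at i=3,
#giving an ungapped offset of 2; with no match the function returns -1.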

logger("Starting new synchronization cycle.")

#Open existing scoreset index
indexFile <- paste0(cache.dir,"searchIndex.csv")
@@ -35,60 +84,84 @@ if (file.exists(indexFile)) {
index <- NULL
}

#Open API connection
rmave <- new.rapimave(baseURL)

#Query list of scoresets
scoresets <- rmave$getAllScoreSets()

#Iterate over scoresets
invisible(lapply(scoresets,function(scoreset) {

urn <- scoreset$getURN()
name <- scoreset$getTitle()

#No need to process if it's already known
if (!is.null(index) && urn %in% index$urn) {
return(NULL)
}

logger(paste("New scoreset found:",urn))

target <- scoreset$getTarget()
tname <- target$getName()
wtseq <- target$getSequence()
uniprot <- target$getXrefUniprot()

label <- paste0(urn,": ",tname," - ",name)

#Download scores and write to cache location
scoreTable <- rmave$getScores(urn)
scoreCacheFile <- paste0(cache.dir,urn,".csv")
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)

#Parse score file to check for presence of syn/stop
if (grepl(" \\(",scoreTable$hgvs[[1]])) {
hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
} else {
hgvsp <- scoreTable$hgvs
}
varInfo <- parseHGVS(hgvsp)
#TODO: pre-cache varInfo

hasStop <- any(varInfo$variant %in% c("Ter","*"))
hasSyn <- any(varInfo$type == "synonymous")

#add scoreset information to index
index <<- rbind(index,data.frame(
label=label,urn=urn,target=tname,wt=wtseq,
uniprot=uniprot$getID(),
syn=if (hasSyn) "auto" else "manual",
stop=if (hasStop) "auto" else "manual"
))

logger("...cached and indexed")
tryCatch({

#Open API connection
rmave <- new.rapimave(baseURL)

#Query list of scoresets
scoresets <- rmave$getAllScoreSets()

#Iterate over scoresets
invisible(lapply(scoresets,function(scoreset) {

if (!is.null(scoreset$getNextVersion())) {
#it's an outdated scoreset!
return(NULL)
}

urn <- scoreset$getURN()
name <- scoreset$getTitle()

#No need to process if it's already known
if (!is.null(index) && urn %in% index$urn) {
return(NULL)
}

logger(paste("New scoreset found:",urn))

target <- scoreset$getTarget()
tname <- target$getName()
wtseq <- target$getSequence()
uniprot <- target$getXrefUniprot()

value <- paste0(urn,": ",tname," - ",name)
label <- paste0(tname," - ",name)

#Download scores and write to cache location
scoreTable <- rmave$getScores(urn)
scoreCacheFile <- paste0(cache.dir,urn,".csv")
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)

#Parse score file to check for presence of syn/stop
if (grepl(" \\(",scoreTable$hgvs[[1]])) {
hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
} else {
hgvsp <- scoreTable$hgvs
}
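#e.g. a combined entry such as "c.4A>G (p.Thr2Ala)" (hypothetical) is
#split at " (" and the trailing ")" stripped, leaving "p.Thr2Ala" for
#parseHGVS() below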
varInfo <- parseHGVS(hgvsp)

#Cache varInfo
mutCacheFile <- paste0(cache.dir,urn,"_muts.csv")
write.table(varInfo,mutCacheFile,sep=",",row.names=FALSE)

#Get offset, or calculate it if necessary
offset <- uniprot$getOffset()
if (is.null(offset)) {
offset <- calcOffset(uniprot$getID(),wtseq)
}

#determine whether stop and synonymous variants are present
hasStop <- any(varInfo$variant %in% c("Ter","*"))
hasSyn <- any(varInfo$type == "synonymous")

#add scoreset information to index
index <<- rbind(index,data.frame(
value=value,label=label,urn=urn,target=tname,
uniprot=uniprot$getID(),
syn=if (hasSyn) "auto" else "manual",
stop=if (hasStop) "auto" else "manual",
offset=offset, wt=wtseq
))
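#illustrative index row (all values hypothetical):
# value="urn:mavedb:00000001-a-1: BRCA1 - E3 ligase activity"
# label="BRCA1 - E3 ligase activity", urn="urn:mavedb:00000001-a-1",
# target="BRCA1", uniprot="P38398", syn="auto", stop="auto", offset=0,
# wt="ATGGAT..."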

logger("...cached and indexed")
}))

#TODO: pre-cache alignments, PDB files, and structure tracks.

logger("Synchronization complete.")

},error=function(e) {
logger("ERROR: Synchronization failed!")
logger(conditionMessage(e))
})

#TODO: pre-cache alignments PDB files and structure tracks.

logger("Synchronization complete.")
