diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2071d03..40036e5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -88,7 +88,7 @@ RUN a2dissite 000-default.conf && a2ensite mavevis
 # ln -sf /proc/self/fd/1 /var/www/mavevis/logs/access.log
 
 #setup startup script and daemon
-COPY startup.sh daemon.R /setup/
+COPY startup.sh daemon.R sync.R /setup/
 
 #startup script starts apache and daemon
 CMD bash /setup/startup.sh
diff --git a/docker/daemon.R b/docker/daemon.R
index 739b2c6..4097d89 100644
--- a/docker/daemon.R
+++ b/docker/daemon.R
@@ -61,6 +61,9 @@ daemon <- function() {
 	logger("INFO: Daemon started.")
 	#infinite loop
 	while(TRUE) {
+		#start DB synchronization if necessary
+		#TODO: enable once production DB is updated
+		# check.sync()
 		#patrol the directory for new jobs
 		patrol()
 		#sleep for two seconds until next patrol
@@ -71,6 +74,25 @@ daemon <- function() {
 	)
 }
 
+#time of last synchronization cycle
+lastSyncTime <- as.Date("2018-01-01")
+
+#check whether 1 day has passed since the last synchronization; if so, run it.
+check.sync <- function() {
+	#calculate time passed since last synchronization
+	daysSinceSync <- difftime(Sys.time(), lastSyncTime, units = "days")
+	#if more than one day has passed
+	if (daysSinceSync > 1) {
+		#start the sync job as a background process
+		system(
+			paste("Rscript sync.R"),
+			wait=FALSE
+		)
+		#update synchronization time
+		lastSyncTime <<- Sys.time()
+	}
+}
+
 #check the directory for jobs and deal with them
 patrol <- function() {
 	#catch any errors and handle them
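A note on the scheduling above: check.sync() is a once-per-day throttle around a fire-and-forget system() call, polled from the daemon's two-second patrol loop. A minimal standalone sketch of the same pattern in plain base R (check.sync.demo and the message() call are illustrative stand-ins, not part of the diff):

#time of the last sync; far enough in the past that the first check fires immediately
lastSyncTime <- as.Date("2018-01-01")

check.sync.demo <- function() {
	#days elapsed since the last sync (difftime coerces the Date to POSIXct)
	daysSinceSync <- difftime(Sys.time(), lastSyncTime, units = "days")
	if (daysSinceSync > 1) {
		#stand-in for system("Rscript sync.R", wait=FALSE)
		message("would launch: Rscript sync.R")
		#record the launch time so the next check waits another day
		lastSyncTime <<- Sys.time()
	}
}

check.sync.demo()   #fires: more than a day has passed since 2018-01-01
check.sync.demo()   #silent: the timestamp was just reset

Because wait=FALSE returns immediately, a slow synchronization never blocks the patrol loop; it also means overlapping runs are prevented only by the one-day spacing, not by a lock.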
diff --git a/docker/sync.R b/docker/sync.R
index e08dda7..70e1ce4 100644
--- a/docker/sync.R
+++ b/docker/sync.R
@@ -6,10 +6,12 @@ options(stringsAsFactors=FALSE)
 library(rapimave)
 library(hgvsParseR)
 library(yogitools)
+library(mavevis)
 
 baseURL <- "http://ec2-13-210-169-246.ap-southeast-2.compute.amazonaws.com/api/"
 
 #Caching directory
+# cache.dir <- paste0(tempdir(),"/")
 cache.dir <- Sys.getenv("MAVEVIS_CACHE",unset="/var/www/mavevis/cache/")
 if (!file.exists(cache.dir)) {
 	stop("Cache directory does not exist!")
@@ -25,7 +27,54 @@ logger <- function(msg) {
 	close(con)
 }
 
-logger("Starting new synchronization cyle.")
+#translate nucleotide sequence to amino acid sequence
+translate <- function(dna) {
+	#load translation table from mavevis package
+	data("trtable")
+	#define codon start positions
+	cstarts <- seq(1,nchar(dna),3)
+	#extract codons and translate
+	aa.seq <- paste(sapply(cstarts,function(cs) trtable[[substr(dna,cs,cs+2)]]),collapse="")
+	return(aa.seq)
+}
+
+#calculate the (ungapped) offset of the MaveDB target sequence within the Uniprot sequence
+calcOffset <- function(uniprot.acc, maveSeq) {
+
+	#if it's DNA, translate to protein first
+	if (grepl("^[ACGT]+$",maveSeq)) {
+		maveSeq <- translate(maveSeq)
+	}
+
+	#use the getUniprotSeq function from the mavevis package to download the sequence from Uniprot
+	uniSeq <- getUniprotSeq(uniprot.acc)
+
+	error <- "ERROR: WT sequence does not match Uniprot entry"
+
+	#if the sequence in MaveDB is longer than the one in Uniprot, it can't be a match
+	if (nchar(uniSeq) < nchar(maveSeq)) {
+		logger(error)
+		return(-1)
+	#if they're identical, there is no offset
+	} else if (uniSeq == maveSeq) {
+		return(0)
+	} else {
+		#linearly traverse the longer sequence until it matches
+		i <- 1
+		imax <- nchar(uniSeq)-nchar(maveSeq)
+		while (maveSeq != substr(uniSeq,i,i+nchar(maveSeq)-1) && i <= imax+1) {
+			i <- i+1
+		}
+		if (i <= imax+1) {
+			return(i-1)
+		} else {
+			logger(error)
+			return(-1)
+		}
+	}
+}
+
+logger("Starting new synchronization cycle.")
 
 #Open existing scoreset index
 indexFile <- paste0(cache.dir,"searchIndex.csv")
@@ -35,60 +84,84 @@ if (file.exists(indexFile)) {
 	index <- NULL
 }
 
-#Open API connection
-rmave <- new.rapimave(baseURL)
-
-#Query list of scoresets
-scoresets <- rmave$getAllScoreSets()
-
-#Iterate overscoresets
-invisible(lapply(scoresets,function(scoreset) {
-
-	urn <- scoreset$getURN()
-	name <- scoreset$getTitle()
-
-	#No need to process if it's already known
-	if (!is.null(index) && urn %in% index$urn) {
-		return(NULL)
-	}
-
-	logger(paste("New scoreset found:",urn))
-
-	target <- scoreset$getTarget()
-	tname <- target$getName()
-	wtseq <- target$getSequence()
-	uniprot <- target$getXrefUniprot()
-
-	label <- paste0(urn,": ",tname," - ",name)
-
-	#Download scores and write to cache location
-	scoreTable <- rmave$getScores(urn)
-	scoreCacheFile <- paste0(cache.dir,urn,".csv")
-	write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)
-
-	#Parse score file to check for presence of syn/stop
-	if (grepl(" \\(",scoreTable$hgvs[[1]])) {
-		hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
-	} else {
-		hgvsp <- scoreTable$hgvs
-	}
-	varInfo <- parseHGVS(hgvsp)
-	#TODO: pre-cache varInfo
-
-	hasStop <- any(varInfo$variant %in% c("Ter","*"))
-	hasSyn <- any(varInfo$type == "synonymous")
-
-	#add scoreset information to index
-	index <<- rbind(index,data.frame(
-		label=label,urn=urn,target=tname,wt=wtseq,
-		uniprot=uniprot$getID(),
-		syn=if (hasSyn) "auto" else "manual",
-		stop=if (hasStop) "auto" else "manual"
-	))
-
-	logger("...cached and indexed")
-}))
-
-#TODO: pre-cache alignments PDB files and structure tracks.
-
-logger("Synchronization complete.")
+tryCatch({
+
+	#Open API connection
+	rmave <- new.rapimave(baseURL)
+
+	#Query list of scoresets
+	scoresets <- rmave$getAllScoreSets()
+
+	#Iterate over scoresets
+	invisible(lapply(scoresets,function(scoreset) {
+
+		if (!is.null(scoreset$getNextVersion())) {
+			#it's an outdated scoreset!
+			return(NULL)
+		}
+
+		urn <- scoreset$getURN()
+		name <- scoreset$getTitle()
+
+		#No need to process if it's already known
+		if (!is.null(index) && urn %in% index$urn) {
+			return(NULL)
+		}
+
+		logger(paste("New scoreset found:",urn))
+
+		target <- scoreset$getTarget()
+		tname <- target$getName()
+		wtseq <- target$getSequence()
+		uniprot <- target$getXrefUniprot()
+
+		value <- paste0(urn,": ",tname," - ",name)
+		label <- paste0(tname," - ",name)
+
+		#Download scores and write to cache location
+		scoreTable <- rmave$getScores(urn)
+		scoreCacheFile <- paste0(cache.dir,urn,".csv")
+		write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)
+
+		#Parse score file to check for presence of syn/stop
+		if (grepl(" \\(",scoreTable$hgvs[[1]])) {
+			hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
+		} else {
+			hgvsp <- scoreTable$hgvs
+		}
+		varInfo <- parseHGVS(hgvsp)
+
+		#Cache varInfo
+		mutCacheFile <- paste0(cache.dir,urn,"_muts.csv")
+		write.table(varInfo,mutCacheFile,sep=",",row.names=FALSE)
+
+		#Get offset, or calculate it if necessary
+		offset <- uniprot$getOffset()
+		if (is.null(offset)) {
+			offset <- calcOffset(uniprot$getID(),wtseq)
+		}
+
+		#determine whether stop and synonymous variants are present
+		hasStop <- any(varInfo$variant %in% c("Ter","*"))
+		hasSyn <- any(varInfo$type == "synonymous")
+
+		#add scoreset information to index
+		index <<- rbind(index,data.frame(
+			value=value,label=label,urn=urn,target=tname,
+			uniprot=uniprot$getID(),
+			syn=if (hasSyn) "auto" else "manual",
+			stop=if (hasStop) "auto" else "manual",
+			offset=offset, wt=wtseq
+		))
+
+		logger("...cached and indexed")
+	}))
+
+	#TODO: pre-cache alignments, PDB files, and structure tracks.
+
+	logger("Synchronization complete.")
+
+},error=function(e) {
+	logger("ERROR: Synchronization failed!")
+	logger(e)
+})
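To make the offset logic concrete, here is a worked example of the linear scan inside calcOffset(), with made-up sequences standing in for real Uniprot and MaveDB entries:

uniSeq  <- "MKTAYIAKQRQISFVKSHFSRQ"   #hypothetical full-length Uniprot sequence
maveSeq <- "AYIAKQRQIS"               #hypothetical MaveDB target region
len  <- nchar(maveSeq)
imax <- nchar(uniSeq) - len
i <- 1
#slide a window of length len along uniSeq until it matches maveSeq
while (maveSeq != substr(uniSeq, i, i + len - 1) && i <= imax + 1) {
	i <- i + 1
}
i - 1   #3: the target starts at position 4 of the Uniprot sequence, i.e. an offset of 3

Note that translate() assumes the coding sequence length is a multiple of three; a trailing partial codon would have no entry in trtable.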
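The tryCatch() wrapper means a failure anywhere in the cycle, such as an API outage or a malformed scoreset, is logged rather than crashing the process, and the daemon's next daily trigger simply starts a fresh attempt. A minimal sketch of the pattern in plain base R (the message() calls stand in for logger()):

tryCatch({
	stop("simulated MaveDB API failure")   #stand-in for any error during the cycle
}, error = function(e) {
	message("ERROR: Synchronization failed!")
	#conditionMessage() extracts the plain message string from the condition object,
	#which may be safer to hand to a file-writing logger than the condition itself
	message(conditionMessage(e))
})

Since the handler swallows the error after logging it, the Rscript process still exits normally; failures surface only in the log file.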