Commit 6b7483b

more preparation for background synchronization (issue #17)
jweile committed May 25, 2018
1 parent b70d4c4 commit 6b7483b
Showing 3 changed files with 153 additions and 58 deletions.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -88,7 +88,7 @@ RUN a2dissite 000-default.conf && a2ensite mavevis
# ln -sf /proc/self/fd/1 /var/www/mavevis/logs/access.log

#setup startup script and daemon
COPY startup.sh daemon.R /setup/
COPY startup.sh daemon.R sync.R /setup/

#startup script starts apache and daemon
CMD bash /setup/startup.sh
22 changes: 22 additions & 0 deletions docker/daemon.R
@@ -61,6 +61,9 @@ daemon <- function() {
logger("INFO: Daemon started.")
#infinite loop
while(TRUE) {
#start DB synchronization if necessary
#TODO: enable once production DB is updated
# check.sync()
#patrol the directory for new jobs
patrol()
#sleep for two seconds until next patrol
@@ -71,6 +74,25 @@ daemon <- function() {
)
}

#time of last synchronization cycle
lastSyncTime <- as.Date("2018-01-01")

#check whether 1 day has passed since the last synchronization; if so, run it.
check.sync <- function() {
#calculate time passed since last synchronization
daysSinceSync <- difftime(Sys.time(), lastSyncTime, units = "days")
#if more than one day has passed
if (daysSinceSync > 1) {
#start the sync job
system(
"Rscript sync.R",
wait=FALSE
)
#update synchronization time
lastSyncTime <<- Sys.time()
}
}
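
#illustrative trace (hypothetical date; mirrors the logic above):
# difftime(Sys.time(), as.Date("2018-01-01"), units="days") > 1  #TRUE at first patrol
#so the first patrol after startup launches sync.R in the background;
#wait=FALSE keeps the job loop responsive, and resetting lastSyncTime
#defers the next cycle by roughly one day.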

#check the directory for jobs and deal with them
patrol <- function() {
#catch any errors and handle them
187 changes: 130 additions & 57 deletions docker/sync.R
@@ -6,10 +6,12 @@ options(stringsAsFactors=FALSE)
library(rapimave)
library(hgvsParseR)
library(yogitools)
library(mavevis)

baseURL <- "http://ec2-13-210-169-246.ap-southeast-2.compute.amazonaws.com/api/"

#Caching directory
# cache.dir <- paste0(tempdir(),"/")
cache.dir <- Sys.getenv("MAVEVIS_CACHE",unset="/var/www/mavevis/cache/")
if (!file.exists(cache.dir)) {
stop("Cache directory does not exist!")
Expand All @@ -25,7 +27,54 @@ logger <- function(msg) {
close(con)
}

logger("Starting new synchronization cyle.")
#translate nucleotide sequence to amino acid sequence
translate <- function(dna) {
#load translation table from mavevis package
data("trtable")
#define codon start positions
cstarts <- seq(1,nchar(dna),3)
#extract codons and translate
aa.seq <- paste(sapply(cstarts,function(cs) trtable[[substr(dna,cs,cs+2)]]),collapse="")
return(aa.seq)
}
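
#quick sanity check of translate() (illustrative; assumes trtable maps
#codon strings to one-letter amino-acid codes):
# translate("ATGGATCTT")  #expected "MDL" (ATG=M, GAT=D, CTT=L)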

#function to calculate the --ungapped-- offset from the uniprot sequence
calcOffset <- function(uniprot.acc, maveSeq) {

#if it's DNA, translate to protein first
if (grepl("^[ACGT]+$",maveSeq)) {
maveSeq <- translate(maveSeq)
}

#use getUniprotSeq function from mavevis package to download sequence from Uniprot
uniSeq <- getUniprotSeq(uniprot.acc)

error <- "ERROR: WT sequence does not match Uniprot entry"

#if the sequence in MaveDB is longer than the one in Uniprot, it can't be a match
if (nchar(uniSeq) < nchar(maveSeq)) {
logger(error)
return(-1)
#if they're identical, there is no offset
} else if (uniSeq == maveSeq) {
return(0)
} else {
#linearly traverse the longer sequence until it matches
i <- 1
imax <- nchar(uniSeq)-nchar(maveSeq)
while (maveSeq != substr(uniSeq,i,i+nchar(maveSeq)-1) && i <= imax+1) {
i <- i+1
}
if (i <= imax+1) {
#match found at position i, so the ungapped offset is i-1
return(i-1)
} else {
logger(error)
return(-1)
}
}
}
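
#worked example (hypothetical sequences): if the Uniprot sequence were
#"MGDLSAK" and maveSeq were "DLSAK", the scan would match at i=3,
#giving an ungapped offset of 2; with no match the function returns -1.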

logger("Starting new synchronization cycle.")

#Open existing scoreset index
indexFile <- paste0(cache.dir,"searchIndex.csv")
@@ -35,60 +84,84 @@ if (file.exists(indexFile)) {
index <- NULL
}

#Open API connection
rmave <- new.rapimave(baseURL)

#Query list of scoresets
scoresets <- rmave$getAllScoreSets()

#Iterate over scoresets
invisible(lapply(scoresets,function(scoreset) {

urn <- scoreset$getURN()
name <- scoreset$getTitle()

#No need to process if it's already known
if (!is.null(index) && urn %in% index$urn) {
return(NULL)
}

logger(paste("New scoreset found:",urn))

target <- scoreset$getTarget()
tname <- target$getName()
wtseq <- target$getSequence()
uniprot <- target$getXrefUniprot()

label <- paste0(urn,": ",tname," - ",name)

#Download scores and write to cache location
scoreTable <- rmave$getScores(urn)
scoreCacheFile <- paste0(cache.dir,urn,".csv")
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)

#Parse score file to check for presence of syn/stop
if (grepl(" \\(",scoreTable$hgvs[[1]])) {
hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
} else {
hgvsp <- scoreTable$hgvs
}
varInfo <- parseHGVS(hgvsp)
#TODO: pre-cache varInfo

hasStop <- any(varInfo$variant %in% c("Ter","*"))
hasSyn <- any(varInfo$type == "synonymous")

#add scoreset information to index
index <<- rbind(index,data.frame(
label=label,urn=urn,target=tname,wt=wtseq,
uniprot=uniprot$getID(),
syn=if (hasSyn) "auto" else "manual",
stop=if (hasStop) "auto" else "manual"
))

logger("...cached and indexed")
tryCatch({

#Open API connection
rmave <- new.rapimave(baseURL)

#Query list of scoresets
scoresets <- rmave$getAllScoreSets()

#Iterate over scoresets
invisible(lapply(scoresets,function(scoreset) {

if (!is.null(scoreset$getNextVersion())) {
#it's an outdated scoreset!
return(NULL)
}

urn <- scoreset$getURN()
name <- scoreset$getTitle()

#No need to process if it's already known
if (!is.null(index) && urn %in% index$urn) {
return(NULL)
}

logger(paste("New scoreset found:",urn))

target <- scoreset$getTarget()
tname <- target$getName()
wtseq <- target$getSequence()
uniprot <- target$getXrefUniprot()

value <- paste0(urn,": ",tname," - ",name)
label <- paste0(tname," - ",name)

#Download scores and write to cache location
scoreTable <- rmave$getScores(urn)
scoreCacheFile <- paste0(cache.dir,urn,".csv")
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)

#Parse score file to check for presence of syn/stop
if (grepl(" \\(",scoreTable$hgvs[[1]])) {
hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
} else {
hgvsp <- scoreTable$hgvs
}
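#e.g. a combined entry such as "c.4A>G (p.Thr2Ala)" (hypothetical) is
#split at " (" and the trailing ")" stripped, leaving "p.Thr2Ala" for
#parseHGVS() below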
varInfo <- parseHGVS(hgvsp)

#Cache varInfo
mutCacheFile <- paste0(cache.dir,urn,"_muts.csv")
write.table(varInfo,mutCacheFile,sep=",",row.names=FALSE)

#Get offset, or calculate it if necessary
offset <- uniprot$getOffset()
if (is.null(offset)) {
offset <- calcOffset(uniprot$getID(),wtseq)
}

#determine whether stop and synonymous variants are present
hasStop <- any(varInfo$variant %in% c("Ter","*"))
hasSyn <- any(varInfo$type == "synonymous")

#add scoreset information to index
index <<- rbind(index,data.frame(
value=value,label=label,urn=urn,target=tname,
uniprot=uniprot$getID(),
syn=if (hasSyn) "auto" else "manual",
stop=if (hasStop) "auto" else "manual",
offset=offset, wt=wtseq
))
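#illustrative index row (all values hypothetical):
# value="urn:mavedb:00000001-a-1: BRCA1 - E3 ligase activity"
# label="BRCA1 - E3 ligase activity", urn="urn:mavedb:00000001-a-1",
# target="BRCA1", uniprot="P38398", syn="auto", stop="auto", offset=0,
# wt="ATGGAT..."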

logger("...cached and indexed")
}))

#TODO: pre-cache alignments, PDB files, and structure tracks.

logger("Synchronization complete.")

},error=function(e) {
logger("ERROR: Synchronization failed!")
logger(conditionMessage(e))
})

#TODO: pre-cache alignments PDB files and structure tracks.

logger("Synchronization complete.")
