Commit
Improved and enabled background synchronization (Issue #17); added caching for structural features (Issue #4)
jweile committed Jun 28, 2018
1 parent 6e57af5 commit 237ff24
Showing 3 changed files with 108 additions and 42 deletions.
10 changes: 10 additions & 0 deletions R/calcStrucFeats.R
@@ -159,6 +159,14 @@ subcomplex.combos <- function(pdb.file,chain.sets) {
#' }
#' @export
calc.strucfeats <- function(pdb.acc,main.chain) {

struc.cache.file <- getCacheFile(paste0(pdb.acc,":",main.chain,"_features.csv"))

if (file.exists(struc.cache.file)) {
cat("Using cached features...\n")
burial.all <- read.csv(struc.cache.file)
return(burial.all)
}

library("httr")
set_config(config(ssl_verifypeer = 0L))
@@ -238,6 +246,8 @@ calc.strucfeats <- function(pdb.acc,main.chain) {
burial.all <- cbind(burial[as.character(allpos),],secstruc=secstruc[as.character(allpos),"struc"])
rownames(burial.all) <- burial.all$pos <- allpos

write.table(burial.all,struc.cache.file,sep=",")

# cat("Deleting temporary files...\n")
# #clean up temp files
# file.remove(pdb.file)
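For context, here is a minimal, self-contained sketch of the read-through cache pattern this hunk introduces. The getCacheFile below is only a stand-in for the mavevis helper of the same name, which may resolve paths differently; cached.features is a hypothetical wrapper, not a function from this commit.

# Sketch of the read-through cache pattern (assumed helper behaviour).
getCacheFile <- function(name, cache.dir = Sys.getenv("MAVEVIS_CACHE", unset = tempdir())) {
  file.path(cache.dir, name)
}

# Compute-or-load: return the cached table if present, otherwise compute and persist it.
cached.features <- function(pdb.acc, main.chain, compute) {
  cache.file <- getCacheFile(paste0(pdb.acc, ":", main.chain, "_features.csv"))
  if (file.exists(cache.file)) {
    cat("Using cached features...\n")
    return(read.csv(cache.file))
  }
  result <- compute(pdb.acc, main.chain)
  write.table(result, cache.file, sep = ",")
  result
}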
11 changes: 5 additions & 6 deletions docker/daemon.R
@@ -62,8 +62,7 @@ daemon <- function() {
#infinite loop
while(TRUE) {
#start DB synchronization if necessary
#TODO: enable once production DB is updated
# check.sync()
check.sync()
#patrol the directory for new jobs
patrol()
#sleep for two seconds until next patrol
@@ -80,10 +79,10 @@ lastSyncTime <- as.Date("2018-01-01")
#check whether 1 day has passed since the last synchronization, if so run it.
check.sync <- function() {
#calculate time passed since last synchronization
daysSinceSync <- difftime(Sys.time(), lastSyncTime, units = "days")
#if more than one day has passed
if (daysSinceSync > 1) {
#start the sync job
minSinceSync <- difftime(Sys.time(), lastSyncTime, units = "mins")
#if more than five minutes have passed
if (minSinceSync > 5) {
#start a sync job
system(
paste("Rscript sync.R"),
wait=FALSE
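A condensed sketch of how the daemon loop and the throttled check.sync fit together after this change. The five-minute threshold comes from the diff; refreshing lastSyncTime after a run is launched is an assumption, since that bookkeeping is not visible in this hunk.

# Sketch of the throttled background sync inside the daemon's patrol loop.
lastSyncTime <- as.Date("2018-01-01")

check.sync <- function() {
  minSinceSync <- difftime(Sys.time(), lastSyncTime, units = "mins")
  if (minSinceSync > 5) {
    system("Rscript sync.R", wait = FALSE)   # launch without blocking the patrol loop
    lastSyncTime <<- Sys.time()              # assumed bookkeeping, not shown in the diff
  }
}

daemon <- function() {
  while (TRUE) {
    check.sync()    # start DB synchronization if due
    # patrol()      # scan the job directory (defined elsewhere in daemon.R)
    Sys.sleep(2)    # wait before the next patrol
  }
}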
129 changes: 93 additions & 36 deletions docker/sync.R
@@ -2,16 +2,15 @@

options(stringsAsFactors=FALSE)

#This requires the new changes that are in the rapimave devel branch!!
library(rapimave)
library(hgvsParseR)
library(yogitools)
library(mavevis)

baseURL <- "http://ec2-13-210-169-246.ap-southeast-2.compute.amazonaws.com/api/"

baseURL <- "https://www.mavedb.org/api/"

#Caching directory
# cache.dir <- paste0(tempdir(),"/")
cache.dir <- Sys.getenv("MAVEVIS_CACHE",unset="/var/www/mavevis/cache/")
if (!file.exists(cache.dir)) {
stop("Cache directory does not exist!")
@@ -76,14 +75,6 @@ calcOffset <- function(uniprot.acc, maveSeq) {

logger("Starting new synchronization cycle.")

#Open existing scoreset index
indexFile <- paste0(cache.dir,"searchIndex.csv")
if (file.exists(indexFile)) {
index <- read.csv(indexFile)
} else {
index <- NULL
}

tryCatch({

#Open API connection
@@ -92,49 +83,65 @@
#Query list of scoresets
scoresets <- rmave$getAllScoreSets()

#Iterate over scoresets
invisible(lapply(scoresets,function(scoreset) {
#Iterate over scoresets and build index
index <- as.df(lapply(scoresets,function(scoreset) {

#If it's an outdated scoreset, skip it!
if (!is.null(scoreset$getNextVersion())) {
#it's an outdated scoreset!
return(NULL)
}

urn <- scoreset$getURN()
name <- scoreset$getTitle()

#No need to process if it's already known
if (!is.null(index) && urn %in% index$urn) {
return(NULL)
}

logger(paste("New scoreset found:",urn))
# if (!is.null(index) && urn %in% index$urn) {
# return(NULL)
# }

target <- scoreset$getTarget()
tname <- target$getName()
wtseq <- target$getSequence()
uniprot <- target$getXrefUniprot()

value <- paste0(urn,": ",tname," - ",name)
label <- paste0(tname," - ",name)
#If the title already contains the target name, there's no need to repeat it
if (grepl(tname,name)) {
value <- paste0(urn,": ",name)
label <- paste0(name)
} else {
value <- paste0(urn,": ",tname," - ",name)
label <- paste0(tname," - ",name)
}

#Download scores and write to cache location
scoreTable <- rmave$getScores(urn)
#Check if scores are already cached
scoreCacheFile <- paste0(cache.dir,urn,".csv")
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)
if (!file.exists(scoreCacheFile)) {
#If not, download scores and write to cache location
scoreTable <- rmave$getScores(urn)
write.table(scoreTable,scoreCacheFile,sep=",",row.names=FALSE)
}

#Parse score file to check for presence of syn/stop
if (grepl(" \\(",scoreTable$hgvs[[1]])) {
hgvsp <- sub("\\)$","",sapply(strsplit(scoreTable$hgvs," \\("),`[[`,2))
#Check if variant descriptors have already been cached
mutCacheFile <- paste0(cache.dir,urn,"_muts.csv")
if (!file.exists(mutCacheFile)) {
#if not, do so
if (!exists("scoreTable")) {
scoreTable <- read.csv(scoreCacheFile)
}
if (!all(is.na(scoreTable$hgvs_pro))) {
varInfo <- parseHGVS(scoreTable$hgvs_pro)
write.table(varInfo,mutCacheFile,sep=",",row.names=FALSE)
} else {
logger(paste("WARNING: Scoreset",urn,"has no protein-level variant descriptors."))
return(NULL)
}
} else {
hgvsp <- scoreTable$hgvs
varInfo <- read.csv(mutCacheFile)
}
varInfo <- parseHGVS(hgvsp)

#Cache varInfo
mutCacheFile <- paste0(cache.dir,urn,"_muts.csv")
write.table(varInfo,mutCacheFile,sep=",",row.names=FALSE)

#Parse score file to check for presence of syn/stop

#Get offset or calculate if necessary
offset <- uniprot$getOffset()
if (is.null(offset)) {
@@ -145,19 +152,69 @@ tryCatch({
hasStop <- any(varInfo$variant %in% c("Ter","*"))
hasSyn <- any(varInfo$type == "synonymous")


#add scoreset information to index
index <<- rbind(index,data.frame(
list(
value=value,label=label,urn=urn,target=tname,
uniprot=uniprot$getID(),
syn=if (hasSyn) "auto" else "manual",
stop=if (hasStop) "auto" else "manual",
offset=offset, wt=wtseq
))
)

logger("...cached and indexed")
}))

#TODO: pre-cache alignments, PDB files, and structure tracks.
indexFile <- paste0(cache.dir,"searchIndex.csv")
write.table(index,indexFile,sep=",",row.names=FALSE)

logger("Index successfully updated.")

logger("Starting caching cycle.")

#pre-cache alignments, PDB files, and structure tracks.
invisible(lapply(index$uniprot,function(acc) {

#check if pre-calculated alignment exists. If not, create it.
alignment.file <- getCacheFile(paste0(acc,"_alignment.fasta"))
if (!file.exists(alignment.file)) {
logger(paste("Caching alignment for",acc))
calc.conservation(acc)
}

#check if pre-calculated pdb table exists. If not, create it.
pdb.table.file <- getCacheFile(paste0(acc,"_pdbs.csv"))
if (!file.exists(pdb.table.file)) {

logger(paste("Caching structures for",acc))
pdb.table <- find.pdbs(acc)

#iterate over associated pdb structures
apply(pdb.table,1,function(pdb.row) {
pdbacc <- pdb.row[["pdb"]]
mainChains <- strsplit(pdb.row[["mainChains"]],"/")[[1]]
#iterate over possible main chains
lapply(mainChains,function(mc) {
#check if pre-calculated structure data exists. If not, create it.
struc.cache.file <- getCacheFile(paste0(pdbacc,":",mc,"_features.csv"))
if (!file.exists(struc.cache.file)) {
logger(paste("Caching features for",acc,":",pdbacc,"-",mc))
tryCatch({
calc.strucfeats(pdbacc,mc)
},
error=function(e) {
logger(paste(
"ERROR: Features calculation failed for",
acc,":",pdbacc,"-",mc,"\n",e
))
})
}
})
})

}

}))


logger("Synchronization complete.")

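To show what the synchronization cycle produces, here is an illustrative snippet that reads the resulting searchIndex.csv. The column names come from the index construction above; the target lookup itself is hypothetical and not part of this commit.

# Illustrative consumer of the index written by sync.R (not part of this commit).
cache.dir <- Sys.getenv("MAVEVIS_CACHE", unset = "/var/www/mavevis/cache/")
indexFile <- paste0(cache.dir, "searchIndex.csv")

if (file.exists(indexFile)) {
  index <- read.csv(indexFile)
  # columns written by the sync cycle: value, label, urn, target, uniprot, syn, stop, offset, wt
  # "UBE2I" is a hypothetical target name used only for illustration
  hits <- index[index$target == "UBE2I", c("urn", "label", "uniprot", "offset")]
  print(hits)
}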
