From 69865cb67f92872a95cc88181b9265b8ba6be851 Mon Sep 17 00:00:00 2001
From: Jochen Weile <jochenweile@gmail.com>
Date: Tue, 4 Dec 2018 14:23:08 -0500
Subject: [PATCH] addressing issue #38. finding the matching uniref cluster

---
 R/orthologs.R | 57 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/R/orthologs.R b/R/orthologs.R
index 2764a65..fdc41da 100644
--- a/R/orthologs.R
+++ b/R/orthologs.R
@@ -179,26 +179,60 @@ pdb.informative <- function(pdb.table) {
 #' @param acc the Uniprot accession
 #' @return a numerical vector with the position-wise conservation.
 #' @export
-calc.conservation <- function(acc) {
+calc.conservation <- function(acc,overrideCache=FALSE) {
 	library("httr")
 	set_config(config(ssl_verifypeer = 0L))
 
 	uniprot.base <- "https://www.uniprot.org/uniprot/"
-	uniref90.base <- "https://www.uniprot.org/uniref/UniRef90_"
+	# uniref90.base <- "https://www.uniprot.org/uniref/UniRef90_"
+	uniref90.base <- "https://www.uniprot.org/uniref/"
 	uniparc.base <- "https://www.uniprot.org/uniparc/"
 	batch.base <- "https://www.uniprot.org/uploadlists/"
 
 	#Get Orthologs
 	alignment.file <- getCacheFile(paste0(acc,"_alignment.fasta"))
-	if (!file.exists(alignment.file)) {
-		ref.url <- paste0(uniref90.base,acc,".list")
+	if (!file.exists(alignment.file) || overrideCache) {
+
+		cat("Querying UniRef...")
+
+		#Find the appropriate UniRef cluster entry
+		htr <- POST(batch.base, body=list(
+			uploadQuery=acc,
+			format="list",
+			from="ACC+ID",
+			to="NF90"
+		),encode="multipart")
+		if (http_status(htr)$category != "Success") {
+			stop("Unable to access UniprotKB!\n",http_status(htr)$message)
+		}
+		unirefAccs <- strsplit(content(htr,"text",encoding="UTF-8"),"\n")[[1]]
+		if (length(unirefAccs) == 0){
+			stop("No Uniref entry exists for ",acc)
+		} else if (length(unirefAccs) > 1) {
+			warning("Multiple Uniref entries for ",acc)
+			unirefAccs <- unirefAccs[[1]]
+		} else {
+			cat("success\n")
+		}
+
+		#Resolve the member IDs for the Uniref cluster
+		ref.url <- paste0(uniref90.base,unirefAccs,".list")
 		htr <- GET(ref.url)
 		if (http_status(htr)$category != "Success") {
 			stop("Unable to access Uniref!\n",http_status(htr)$message)
 		}
 		xrefs <- strsplit(content(htr,"text",encoding="UTF-8"),"\n")[[1]]
-		cat("Retrieving sequences...")
+		#double-check that this is the correct cluster
+		if (!(acc %in% xrefs)) {
+			stop("UniRef Cluster does not contain query protein!")
+		}
+		#make sure query protein is at the top of the list, otherwise re-order
+		if (xrefs[[1]] != acc) {
+			xrefs <- c(acc,setdiff(xrefs,acc))
+		}
 
+		cat("Retrieving sequences...")
+		#use batch-service to download multi-fasta file for the set of sequences
 		htr <- POST(batch.base, body=list(
 			uploadQuery=paste(xrefs,collapse=" "),
 			format="fasta",
@@ -209,7 +243,7 @@ calc.conservation <- function(acc) {
 			stop("Error accessing UniprotKB!\n",http_status(htr)$message)
 		}	
 		multifasta <- content(htr,"text",encoding="UTF-8")
-		cat("done\n")
+		cat("success\n")
 
 		#export to 
 		fasta.file <- getCacheFile(paste0(acc,"_orthologs.fasta"))
@@ -220,21 +254,14 @@ calc.conservation <- function(acc) {
 		#Run ClustalOmega on the sequences
 		cat("Aligning sequences...")
 		retVal <- system(paste(
-			"clustalo -i",fasta.file,"-o",alignment.file
+			"clustalo -i",fasta.file,"--force -o",alignment.file
 		),wait=TRUE)
 		if (retVal != 0) stop("ClustalOmega failed!")
+		cat("success\n")
 	} else cat("Using archived alignment.\n")
 
-	# title.is <- which(grepl("^>",alignment.lines))
-	# end.is <- c(title.is[-1]-1,length(alignment.lines))
-	# alignment.rows <- mapply(function(s,e) paste(alignment.lines[s:e],collapse=""),s=title.is+1,e=end.is)
-	# names(alignment.rows) <- xrefs
-
 	amas <- new.amasLite()
 	conservation <- amas$run(alignment.file)
 
 	return(conservation)
 }
-
-#https://www.uniprot.org/uniparc/UPI0006D932F4.fasta
-# https://www.uniprot.org/uniref/UniRef50_P63279.list
\ No newline at end of file