Skip to content

Commit

Permalink
Adjusting sequence download to new Uniprot Programmatic Access
Browse files Browse the repository at this point in the history
https://www.uniprot.org/help/programmatic_access

While this solution works, queries with many hits at UniProtKB take a
very long time to retrieve the corresponding protein clusters. We will need to
optimize the UniProtKB-ID-to-UniRef mapping ...

Refers to issue #128
  • Loading branch information
Waschina committed Oct 3, 2022
1 parent 50ca9fa commit 82ff4ad
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 16 deletions.
6 changes: 3 additions & 3 deletions docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Ubuntu/Debian/Mint
```
sudo apt install ncbi-blast+ git libglpk-dev r-base-core exonerate bedtools barrnap bc parallel libcurl4-openssl-dev
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand All @@ -13,7 +13,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq
sudo yum install ncbi-blast+ git glpk-devel BEDTools exonerate hmmer bc parallel libcurl-devel
git clone https://github.com/tseemann/barrnap.git
export PATH=$PATH:barrnap/bin/barrnap # needs to be permanent => .bashrc ?
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand All @@ -22,7 +22,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq
Using [homebrew](https://brew.sh). Please note: Some Mac-Users reported difficulties to install gapseq on MacOS using the following commands. The issues are mainly due to some Mac-specific functioning of central programs such as sed, awk, and grep. If you are experiencing issues, we recommend to try to install gapseq in an own conda environment using the steps described [below](#conda).
```
brew install coreutils binutils git glpk blast bedtools r brewsci/bio/barrnap grep bc gzip parallel curl
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand Down
1 change: 1 addition & 0 deletions gapseq_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ dependencies:
- r-renv
- r-glpkapi
- r-rcurl
- r-httr
2 changes: 1 addition & 1 deletion src/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ echo -e "\nMissing dependencies: $i\n\n"
echo "#####################"
echo "#Checking R packages#"
echo "#####################"
Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")'
Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ", "httr"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")'

echo "##############################"
echo "#Checking basic functionality#"
Expand Down
20 changes: 8 additions & 12 deletions src/uniprot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ shift $((OPTIND-1))
# consider special case of eg. prokaryotes which are not supported by uniprot taxonomy
if [ "$taxonomy" = "Prokaryota" ]; then
folder=Prokaryota
taxonomy="(bacteria%20OR%20archaea)"
taxonomy="Bacteria,Archaea"
else
folder=$taxonomy
fi
Expand Down Expand Up @@ -127,12 +127,11 @@ if [ -n "$ecnumber" ]; then
echo -en " ... Downloading $ec \t\t"
if [ "$get_unrev" = false ]; then
#swissprot
url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
# unreviewed
url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q $url -O $ec.fasta && stat=ok || stat=err # download only if file not exists
newmd5=$(md5sum $ec.fasta | cut -d " " -f1)
newcount=$(cat $ec.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$ec.fasta $stat" >> ../updating.log
Expand All @@ -157,11 +156,10 @@ if [ -n "$reaNames" ]; then
rm -f $reaNameHash.fasta
echo -en " ... Downloading $rea\t\t"
if [ "$get_unrev" = false ]; then
url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q "$url" -O "$reaNameHash.fasta" && stat=ok || stat=err
newmd5=$(md5sum $reaNameHash.fasta | cut -d " " -f1)
newcount=$(cat $reaNameHash.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$reaNameHash.fasta $stat" >> ../updating.log
Expand All @@ -177,11 +175,10 @@ if [ -n "$geneName" ]; then
rm -f $geneName.fasta
echo -en " ... Downloading $geneName\t\t"
if [ "$get_unrev" = false ]; then
url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R gene "$geneName" $reaNameHash.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R gene "$geneName" $reaNameHash.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q "$url" -O "$geneName.fasta" && stat=ok || stat=err
newmd5=$(md5sum $geneName.fasta | cut -d " " -f1)
newcount=$(cat $geneName.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$geneName.fasta $stat" >> ../updating.log
Expand All @@ -198,8 +195,7 @@ if [ -n "$dbref" ]; then
oldcount=$(cat $dbref.fasta | grep ">" | wc -l)
rm -f $dbref.fasta
echo -en " ... Downloading $dbref\t\t"
url="https://www.uniprot.org/uniprot/?query=uniprot%3A(id%3A\"$dbref\"%20taxonomy%3A$taxonomy)&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
wget -q "$url" -O "$dbref.fasta" && stat=ok || stat=err
Rscript $dir/uniprot_query.R id $dbref $dbref.fasta $taxonomy && stat=ok || stat=err
newmd5=$(md5sum $dbref.fasta | cut -d " " -f1)
newcount=$(cat $dbref.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$dbref.fasta $stat" >> ../updating.log
Expand Down
169 changes: 169 additions & 0 deletions src/uniprot_query.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
library(httr)
library(stringr)

# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~#
# Programmatic download of protein cluster sequences from EC / Protein name / #
# Gene ID queries via Uniprot REST requires 3 steps: #
# (1) Find UniprotKB ID/AC of query-matching proteins #
# (2) Match IDs/Accessions from (1) to identity-based protein clusters #
# (UniRef_90/UniRef_50) #
# (3) Retrieve representative Sequences of protein clusters #
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~#

# query_type <- "ec" # (either "ec", "protein_name", "gene_exact" or "id").
# query_term <- "1.2.1.3"
# Page size ("size" URL parameter) used when paging through UniProt REST
# search results; 500 keeps the number of round trips low.
batch_size_query_results <- 500
# Number of accessions / cluster IDs OR-ed together into a single search
# query URL. Larger batches risk over-long URLs being rejected by the API.
batch_size_clusters <- 200 # 200 should work: https://github.com/ebi-uniprot/uniprot-rest-api/issues/275#issuecomment-1173888616
max_attempts <- 10 # Maximum number of attempts per query to receive a status==200 response from uniprot website.

# Perform an HTTP GET with retries against the UniProt REST API.
#
# Repeats the request until a response with HTTP status 200 is received or
# `max_attempts` (script-level constant) tries are exhausted. On persistent
# failure the whole script exits with status 1 so the calling shell script
# can flag the download as failed.
#
# url - character(1), the full request URL.
#
# Returns the successful httr response object.
GET_retries <- function(url) {
  attempts <- 1
  get_success <- FALSE
  while (!get_success && attempts <= max_attempts) {
    res <- GET(url)
    attempts <- attempts + 1
    # Only an HTTP 200 counts as success. The previous code stored the raw
    # status code here, so `get_success == FALSE` (i.e. `404 == 0`) was
    # FALSE for ANY response and the loop exited after the first attempt,
    # and the failure exit below never triggered for non-200 responses.
    get_success <- res$status_code == 200
  }
  if (!get_success) {
    quit(save = "no", status = 1)
  }

  res
}
# rev_status <- "true"
# uniref_id <- "0.9"
# output_fasta_file <- "test.fasta"
# taxonomy <- "Bacteria"

# positional arguments:
#  1: query type ("ec", "protein_name", "gene_exact", or "id")
#  2: query term (EC number, protein name, gene name, or UniProtKB accession)
#  3: output file name (FASTA)
#  4: taxonomy (comma-separated taxon names, e.g. "Bacteria,Archaea")
#  5: reviewed status ("true" or "false")        [omitted for type "id"]
#  6: UniRef identity clustering cutoff (0.9 or 0.5) [omitted for type "id"]

args <- commandArgs(trailingOnly = TRUE)

# parse positional arguments
query_type <- args[1]
query_term <- args[2]
output_fasta_file <- args[3]
taxonomy <- args[4]
if (length(args) == 6) {
  rev_status <- args[5]
  uniref_id <- args[6]
}

# Validate the argument count early and fail with an informative usage
# message (the previous code called stop("") which produced an empty,
# undiagnosable error).
if (query_type != "id" && length(args) != 6) {
  stop("Usage: uniprot_query.R <ec|protein_name|gene_exact> <query_term> ",
       "<output_fasta> <taxonomy> <reviewed:true|false> <identity:0.9|0.5>",
       call. = FALSE)
}
if (query_type == "id" && length(args) != 4) {
  stop("Usage: uniprot_query.R id <accession> <output_fasta> <taxonomy>",
       call. = FALSE)
}


# Branch for query types "ec", "protein_name", "gene_exact": three REST
# round-trip stages (see file header): (1) accession IDs, (2) UniRef cluster
# IDs, (3) cluster representative sequences in FASTA.
if(query_type != "id") {
# parse taxonomy term: turn "Bacteria,Archaea" into the query clause
# "((taxonomy_name:Bacteria)%20OR%20(taxonomy_name:Archaea))"
taxonomy <- unlist(strsplit(taxonomy, ","))
taxonomy <- paste0("(taxonomy_name:",taxonomy,")", collapse = "%20OR%20")
taxonomy <- paste0("(",taxonomy,")")

# (0) - Build uniprot entry URL
# format=list returns bare accession IDs, one per line.
urli <- paste0("https://rest.uniprot.org/uniprotkb/search?format=list&query=(",
query_type,":",query_term,
")%20AND%20",taxonomy,"%20AND%20(reviewed:",
rev_status,")&size=",
batch_size_query_results)

# (1) - Find uniprot accession numbers of matching entries
accessions <- c()
total <- NA_integer_
# cat("Retrieving sequence accession IDs ...\n")
# Page through the results: UniProt returns the next page's URL in the
# HTTP "link" response header (rel="next"); the loop ends when no such
# header is present and str_match() yields NA (length(urli) stays 1 but
# NA terminates via the next GET... — actually next_url becomes
# character(0)/NA and the while condition fails). NOTE(review): relies on
# str_match returning a zero-row match when ind_next is empty — confirm.
while(length(urli)==1) {
ri <- GET_retries(urli)
ri_header <- headers(ri)

# get metrics and accessions
# x-total-results header carries the total hit count for progress output.
total <- as.numeric(ri_header$`x-total-results`)
accessions <- c(accessions, unlist(strsplit(httr::content(ri, as = "text", encoding = "UTF-8"),"\n")))
cat("\r\t\t\t",length(accessions), "/", total)

ind_next <- grep("\"next\"$",ri_header$link)

# Extract the URL between "<" and ">" from the link header.
next_url <- str_match(ri_header$link[ind_next], "<\\s*(.*?)\\s*>")[,2]

urli <- next_url
}
cat("\n")
# No hits: write an empty FASTA file and exit successfully (the caller
# treats an empty file as "no sequences found", not as an error).
if(length(accessions) == 0) {
cat(NULL, file = output_fasta_file)
quit(save = "no", status = 0)
}


# (2) retrieve corresponding uniref90 cluster IDs (not sequences)
# Assign each accession to a batch index; note floor(1:n/size) makes the
# first batch one element smaller (indices 1..size-1 map to 0) — harmless
# but slightly uneven batching.
batch_ids <- floor(1:length(accessions)/batch_size_clusters)
cluster_ids <- c()
k <- 0

for(batchi in unique(batch_ids)) {
k <- k + sum(batch_ids == batchi)
cat("\r\t\t\t",k,"/",total)
# OR-join the batch's accessions into one uniref search query
# (URL-encoded: %28 = "(", %29 = ")", %3A = ":").
acc_concat <- paste0("%28uniprot_id%3A",accessions[batch_ids == batchi],"%29",
collapse = "%20OR%20")
urlc <- paste0("https://rest.uniprot.org/uniref/search?format=list&query=%28%28",
acc_concat,"%29%20AND%20%28identity%3A",uniref_id,"%29%29",
"&size=", batch_size_query_results)

ri <- GET_retries(urlc)
cluster_ids <- c(cluster_ids,
unlist(strsplit(httr::content(ri, as = "text",
encoding = "UTF-8"),"\n")))
}
# Several accessions can map to the same cluster; deduplicate.
cluster_ids <- unique(cluster_ids)
if(length(cluster_ids)==0)
quit(save = "no", status = 1)
cat(" (",length(cluster_ids),"cluster sequences found)\n")
total_uniref <- length(cluster_ids)

# (3) retrieve Uniref sequences
batch_ids <- floor(1:length(cluster_ids)/batch_size_clusters)
seqs <- c()
k <- 0

for(batchi in unique(batch_ids)) {
k <- k + sum(batch_ids == batchi)
cat("\r\t\t\t",k,"/",total_uniref)
uniref_acc_concat <- paste0("id%3A",cluster_ids[batch_ids == batchi],
collapse = "%20OR%20")

urls <- paste0("https://rest.uniprot.org/uniref/search?compressed=false&format=fasta&query=",
uniref_acc_concat,"&size=", batch_size_query_results)

ri <- GET_retries(urls)

seqs <- c(seqs, httr::content(ri, as = "text", encoding = "UTF-8"))
}
cat("\n")
if(length(seqs)==0)
quit(save = "no", status = 1)

# Concatenate all FASTA chunks into the output file and exit cleanly.
cat(seqs, sep = "", file = output_fasta_file)
quit(save = "no", status = 0)
}


# Branch for query_type == "id": fetch the sequence of a single UniProtKB
# accession directly in FASTA format (no UniRef cluster mapping needed).
if(query_type == "id") {
# Build the taxonomy restriction the same way as the non-"id" branch:
# "Bacteria,Archaea" -> "((taxonomy_name:Bacteria)%20OR%20(taxonomy_name:Archaea))".
# The previous code pasted the raw taxonomy string into the query, which is
# not a valid UniProt field clause (it was matched as free text instead of
# restricting by taxonomy).
taxonomy <- unlist(strsplit(taxonomy, ","))
taxonomy <- paste0("(taxonomy_name:", taxonomy, ")", collapse = "%20OR%20")
taxonomy <- paste0("(", taxonomy, ")")

# (0) - Build uniprot entry URL (accession lookup, FASTA output)
urli <- paste0("https://rest.uniprot.org/uniprotkb/search?compressed=false&format=fasta&query=%28%28accession%3A",
query_term, "%29%20AND%20", taxonomy, "%29")

ri <- GET_retries(urli)

# Write the FASTA payload (possibly empty if nothing matched) to the
# requested output file and exit cleanly.
seqi <- httr::content(ri, as = "text", encoding = "UTF-8")
cat(seqi, sep = "", file = output_fasta_file)
quit(save = "no", status = 0)
}

0 comments on commit 82ff4ad

Please sign in to comment.