Skip to content

Commit

Permalink
Adjusting sequence download to new Uniprot Programmatic Access
Browse files Browse the repository at this point in the history
https://www.uniprot.org/help/programmatic_access

While this solution works, queries with many hits at UniProtKB take a
very long time to retrieve the corresponding protein clusters. We will need to
optimize the UniProtKB-ID-to-UniRef mapping ...

Refers to issue #128
  • Loading branch information
Waschina committed Oct 3, 2022
1 parent 50ca9fa commit 82ff4ad
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 16 deletions.
6 changes: 3 additions & 3 deletions docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Ubuntu/Debian/Mint
```
sudo apt install ncbi-blast+ git libglpk-dev r-base-core exonerate bedtools barrnap bc parallel libcurl4-openssl-dev
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand All @@ -13,7 +13,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq
sudo yum install ncbi-blast+ git glpk-devel BEDTools exonerate hmmer bc parallel libcurl-devel
git clone https://github.com/tseemann/barrnap.git
export PATH=$PATH:barrnap/bin/barrnap # needs to be permanent => .bashrc ?
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand All @@ -22,7 +22,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq
Using [homebrew](https://brew.sh). Please note: Some Mac-Users reported difficulties to install gapseq on MacOS using the following commands. The issues are mainly due to some Mac-specific functioning of central programs such as sed, awk, and grep. If you are experiencing issues, we recommend to try to install gapseq in an own conda environment using the steps described [below](#conda).
```
brew install coreutils binutils git glpk blast bedtools r brewsci/bio/barrnap grep bc gzip parallel curl
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))'
R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))'
R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")'
git clone https://github.com/jotech/gapseq && cd gapseq
```
Expand Down
1 change: 1 addition & 0 deletions gapseq_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ dependencies:
- r-renv
- r-glpkapi
- r-rcurl
- r-httr
2 changes: 1 addition & 1 deletion src/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ echo -e "\nMissing dependencies: $i\n\n"
echo "#####################"
echo "#Checking R packages#"
echo "#####################"
Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")'
Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ", "httr"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")'

echo "##############################"
echo "#Checking basic functionality#"
Expand Down
20 changes: 8 additions & 12 deletions src/uniprot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ shift $((OPTIND-1))
# consider special case of eg. prokaryotes which are not supported by uniprot taxonomy
if [ "$taxonomy" = "Prokaryota" ]; then
folder=Prokaryota
taxonomy="(bacteria%20OR%20archaea)"
taxonomy="Bacteria,Archaea"
else
folder=$taxonomy
fi
Expand Down Expand Up @@ -127,12 +127,11 @@ if [ -n "$ecnumber" ]; then
echo -en " ... Downloading $ec \t\t"
if [ "$get_unrev" = false ]; then
#swissprot
url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
# unreviewed
url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q $url -O $ec.fasta && stat=ok || stat=err # download only if file not exists
newmd5=$(md5sum $ec.fasta | cut -d " " -f1)
newcount=$(cat $ec.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$ec.fasta $stat" >> ../updating.log
Expand All @@ -157,11 +156,10 @@ if [ -n "$reaNames" ]; then
rm -f $reaNameHash.fasta
echo -en " ... Downloading $rea\t\t"
if [ "$get_unrev" = false ]; then
url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q "$url" -O "$reaNameHash.fasta" && stat=ok || stat=err
newmd5=$(md5sum $reaNameHash.fasta | cut -d " " -f1)
newcount=$(cat $reaNameHash.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$reaNameHash.fasta $stat" >> ../updating.log
Expand All @@ -177,11 +175,10 @@ if [ -n "$geneName" ]; then
rm -f $geneName.fasta
echo -en " ... Downloading $geneName\t\t"
if [ "$get_unrev" = false ]; then
url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R gene "$geneName" $reaNameHash.fasta $taxonomy true 0.9 && stat=ok || stat=err
else
url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
Rscript $dir/uniprot_query.R gene "$geneName" $reaNameHash.fasta $taxonomy false 0.5 && stat=ok || stat=err
fi
wget -q "$url" -O "$geneName.fasta" && stat=ok || stat=err
newmd5=$(md5sum $geneName.fasta | cut -d " " -f1)
newcount=$(cat $geneName.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$geneName.fasta $stat" >> ../updating.log
Expand All @@ -198,8 +195,7 @@ if [ -n "$dbref" ]; then
oldcount=$(cat $dbref.fasta | grep ">" | wc -l)
rm -f $dbref.fasta
echo -en " ... Downloading $dbref\t\t"
url="https://www.uniprot.org/uniprot/?query=uniprot%3A(id%3A\"$dbref\"%20taxonomy%3A$taxonomy)&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta"
wget -q "$url" -O "$dbref.fasta" && stat=ok || stat=err
Rscript $dir/uniprot_query.R id $dbref $dbref.fasta $taxonomy && stat=ok || stat=err
newmd5=$(md5sum $dbref.fasta | cut -d " " -f1)
newcount=$(cat $dbref.fasta | grep ">" | wc -l)
[[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$dbref.fasta $stat" >> ../updating.log
Expand Down
169 changes: 169 additions & 0 deletions src/uniprot_query.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
library(httr)
library(stringr)

# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~#
# Programmatic download of protein cluster sequences from EC / Protein name / #
# Gene ID queries via Uniprot REST requires 3 steps: #
# (1) Find UniprotKB ID/AC of query-matching proteins #
# (2) Match IDs/Accessions from (1) to identity-based protein clusters #
# (UniRef_90/UniRef_50) #
# (3) Retrieve representative Sequences of protein clusters #
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~#

# query_type <- "ec" # (either "ec", "protein_name", "gene_exact" or "id").
# query_term <- "1.2.1.3"
# Page size ("size" URL parameter) used when paging through UniProt REST
# search results; 500 keeps the number of round trips low.
batch_size_query_results <- 500
# Number of accessions / cluster IDs OR-ed together into a single search
# query URL. Larger batches risk over-long URLs being rejected by the API.
batch_size_clusters <- 200 # 200 should work: https://github.com/ebi-uniprot/uniprot-rest-api/issues/275#issuecomment-1173888616
max_attempts <- 10 # Maximum number of attempts per query to receive a status==200 response from uniprot website.

# Perform an HTTP GET with retries against the UniProt REST API.
#
# Repeats the request until a response with HTTP status 200 is received or
# `max_attempts` (script-level constant) tries are exhausted. On persistent
# failure the whole script exits with status 1 so the calling shell script
# can flag the download as failed.
#
# url - character(1), the full request URL.
#
# Returns the successful httr response object.
GET_retries <- function(url) {
  attempts <- 1
  get_success <- FALSE
  while (!get_success && attempts <= max_attempts) {
    res <- GET(url)
    attempts <- attempts + 1
    # Only an HTTP 200 counts as success. The previous code stored the raw
    # status code here, so `get_success == FALSE` (i.e. `404 == 0`) was
    # FALSE for ANY response and the loop exited after the first attempt,
    # and the failure exit below never triggered for non-200 responses.
    get_success <- res$status_code == 200
  }
  if (!get_success) {
    quit(save = "no", status = 1)
  }

  res
}
# rev_status <- "true"
# uniref_id <- "0.9"
# output_fasta_file <- "test.fasta"
# taxonomy <- "Bacteria"

# positional arguments:
#  1: query type ("ec", "protein_name", "gene_exact", or "id")
#  2: query term (EC number, protein name, gene name, or UniProtKB accession)
#  3: output file name (FASTA)
#  4: taxonomy (comma-separated taxon names, e.g. "Bacteria,Archaea")
#  5: reviewed status ("true" or "false")        [omitted for type "id"]
#  6: UniRef identity clustering cutoff (0.9 or 0.5) [omitted for type "id"]

args <- commandArgs(trailingOnly = TRUE)

# parse positional arguments
query_type <- args[1]
query_term <- args[2]
output_fasta_file <- args[3]
taxonomy <- args[4]
if (length(args) == 6) {
  rev_status <- args[5]
  uniref_id <- args[6]
}

# Validate the argument count early and fail with an informative usage
# message (the previous code called stop("") which produced an empty,
# undiagnosable error).
if (query_type != "id" && length(args) != 6) {
  stop("Usage: uniprot_query.R <ec|protein_name|gene_exact> <query_term> ",
       "<output_fasta> <taxonomy> <reviewed:true|false> <identity:0.9|0.5>",
       call. = FALSE)
}
if (query_type == "id" && length(args) != 4) {
  stop("Usage: uniprot_query.R id <accession> <output_fasta> <taxonomy>",
       call. = FALSE)
}


# Branch for query types "ec", "protein_name", "gene_exact": three REST
# round-trip stages (see file header): (1) accession IDs, (2) UniRef cluster
# IDs, (3) cluster representative sequences in FASTA.
if(query_type != "id") {
# parse taxonomy term: turn "Bacteria,Archaea" into the query clause
# "((taxonomy_name:Bacteria)%20OR%20(taxonomy_name:Archaea))"
taxonomy <- unlist(strsplit(taxonomy, ","))
taxonomy <- paste0("(taxonomy_name:",taxonomy,")", collapse = "%20OR%20")
taxonomy <- paste0("(",taxonomy,")")

# (0) - Build uniprot entry URL
# format=list returns bare accession IDs, one per line.
urli <- paste0("https://rest.uniprot.org/uniprotkb/search?format=list&query=(",
query_type,":",query_term,
")%20AND%20",taxonomy,"%20AND%20(reviewed:",
rev_status,")&size=",
batch_size_query_results)

# (1) - Find uniprot accession numbers of matching entries
accessions <- c()
total <- NA_integer_
# cat("Retrieving sequence accession IDs ...\n")
# Page through the results: UniProt returns the next page's URL in the
# HTTP "link" response header (rel="next"); the loop ends when no such
# header is present and str_match() yields NA (length(urli) stays 1 but
# NA terminates via the next GET... — actually next_url becomes
# character(0)/NA and the while condition fails). NOTE(review): relies on
# str_match returning a zero-row match when ind_next is empty — confirm.
while(length(urli)==1) {
ri <- GET_retries(urli)
ri_header <- headers(ri)

# get metrics and accessions
# x-total-results header carries the total hit count for progress output.
total <- as.numeric(ri_header$`x-total-results`)
accessions <- c(accessions, unlist(strsplit(httr::content(ri, as = "text", encoding = "UTF-8"),"\n")))
cat("\r\t\t\t",length(accessions), "/", total)

ind_next <- grep("\"next\"$",ri_header$link)

# Extract the URL between "<" and ">" from the link header.
next_url <- str_match(ri_header$link[ind_next], "<\\s*(.*?)\\s*>")[,2]

urli <- next_url
}
cat("\n")
# No hits: write an empty FASTA file and exit successfully (the caller
# treats an empty file as "no sequences found", not as an error).
if(length(accessions) == 0) {
cat(NULL, file = output_fasta_file)
quit(save = "no", status = 0)
}


# (2) retrieve corresponding uniref90 cluster IDs (not sequences)
# Assign each accession to a batch index; note floor(1:n/size) makes the
# first batch one element smaller (indices 1..size-1 map to 0) — harmless
# but slightly uneven batching.
batch_ids <- floor(1:length(accessions)/batch_size_clusters)
cluster_ids <- c()
k <- 0

for(batchi in unique(batch_ids)) {
k <- k + sum(batch_ids == batchi)
cat("\r\t\t\t",k,"/",total)
# OR-join the batch's accessions into one uniref search query
# (URL-encoded: %28 = "(", %29 = ")", %3A = ":").
acc_concat <- paste0("%28uniprot_id%3A",accessions[batch_ids == batchi],"%29",
collapse = "%20OR%20")
urlc <- paste0("https://rest.uniprot.org/uniref/search?format=list&query=%28%28",
acc_concat,"%29%20AND%20%28identity%3A",uniref_id,"%29%29",
"&size=", batch_size_query_results)

ri <- GET_retries(urlc)
cluster_ids <- c(cluster_ids,
unlist(strsplit(httr::content(ri, as = "text",
encoding = "UTF-8"),"\n")))
}
# Several accessions can map to the same cluster; deduplicate.
cluster_ids <- unique(cluster_ids)
if(length(cluster_ids)==0)
quit(save = "no", status = 1)
cat(" (",length(cluster_ids),"cluster sequences found)\n")
total_uniref <- length(cluster_ids)

# (3) retrieve Uniref sequences
batch_ids <- floor(1:length(cluster_ids)/batch_size_clusters)
seqs <- c()
k <- 0

for(batchi in unique(batch_ids)) {
k <- k + sum(batch_ids == batchi)
cat("\r\t\t\t",k,"/",total_uniref)
uniref_acc_concat <- paste0("id%3A",cluster_ids[batch_ids == batchi],
collapse = "%20OR%20")

urls <- paste0("https://rest.uniprot.org/uniref/search?compressed=false&format=fasta&query=",
uniref_acc_concat,"&size=", batch_size_query_results)

ri <- GET_retries(urls)

seqs <- c(seqs, httr::content(ri, as = "text", encoding = "UTF-8"))
}
cat("\n")
if(length(seqs)==0)
quit(save = "no", status = 1)

# Concatenate all FASTA chunks into the output file and exit cleanly.
cat(seqs, sep = "", file = output_fasta_file)
quit(save = "no", status = 0)
}


# Branch for query_type == "id": fetch the sequence of a single UniProtKB
# accession directly in FASTA format (no UniRef cluster mapping needed).
if(query_type == "id") {
# Build the taxonomy restriction the same way as the non-"id" branch:
# "Bacteria,Archaea" -> "((taxonomy_name:Bacteria)%20OR%20(taxonomy_name:Archaea))".
# The previous code pasted the raw taxonomy string into the query, which is
# not a valid UniProt field clause (it was matched as free text instead of
# restricting by taxonomy).
taxonomy <- unlist(strsplit(taxonomy, ","))
taxonomy <- paste0("(taxonomy_name:", taxonomy, ")", collapse = "%20OR%20")
taxonomy <- paste0("(", taxonomy, ")")

# (0) - Build uniprot entry URL (accession lookup, FASTA output)
urli <- paste0("https://rest.uniprot.org/uniprotkb/search?compressed=false&format=fasta&query=%28%28accession%3A",
query_term, "%29%20AND%20", taxonomy, "%29")

ri <- GET_retries(urli)

# Write the FASTA payload (possibly empty if nothing matched) to the
# requested output file and exit cleanly.
seqi <- httr::content(ri, as = "text", encoding = "UTF-8")
cat(seqi, sep = "", file = output_fasta_file)
quit(save = "no", status = 0)
}

0 comments on commit 82ff4ad

Please sign in to comment.