diff --git a/docs/install.md b/docs/install.md index befde38b..892f3018 100644 --- a/docs/install.md +++ b/docs/install.md @@ -3,7 +3,7 @@ ## Ubuntu/Debian/Mint ``` sudo apt install ncbi-blast+ git libglpk-dev r-base-core exonerate bedtools barrnap bc parallel libcurl4-openssl-dev -R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))' +R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))' R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")' git clone https://github.com/jotech/gapseq && cd gapseq ``` @@ -13,7 +13,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq sudo yum install ncbi-blast+ git glpk-devel BEDTools exonerate hmmer bc parallel libcurl-devel git clone https://github.com/tseemann/barrnap.git export PATH=$PATH:barrnap/bin/barrnap # needs to be permanent => .bashrc ? -R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))' +R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))' R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")' git clone https://github.com/jotech/gapseq && cd gapseq ``` @@ -22,7 +22,7 @@ git clone https://github.com/jotech/gapseq && cd gapseq Using [homebrew](https://brew.sh). Please note: Some Mac-Users reported difficulties to install gapseq on MacOS using the following commands. The issues are mainly due to some Mac-specific functioning of central programs such as sed, awk, and grep. 
If you are experiencing issues, we recommend to try to install gapseq in an own conda environment using the steps described [below](#conda). ``` brew install coreutils binutils git glpk blast bedtools r brewsci/bio/barrnap grep bc gzip parallel curl -R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite"))' +R -e 'install.packages(c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "CHNOSZ", "jsonlite","httr"))' R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("Biostrings")' git clone https://github.com/jotech/gapseq && cd gapseq ``` diff --git a/gapseq_env.yml b/gapseq_env.yml index ac261955..c164726f 100644 --- a/gapseq_env.yml +++ b/gapseq_env.yml @@ -35,3 +35,4 @@ dependencies: - r-renv - r-glpkapi - r-rcurl + - r-httr diff --git a/src/test.sh b/src/test.sh index bdbf3a7b..38cd1958 100755 --- a/src/test.sh +++ b/src/test.sh @@ -52,7 +52,7 @@ echo -e "\nMissing dependencies: $i\n\n" echo "#####################" echo "#Checking R packages#" echo "#####################" -Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")' +Rscript -e 'needed.packages <- c("data.table", "stringr", "sybil", "getopt", "doParallel", "foreach", "R.utils", "stringi", "glpkAPI", "BiocManager", "Biostrings", "jsonlite", "CHNOSZ", "httr"); avail.packages <- installed.packages(); i=0; for( pkg in needed.packages ){; idx <- match(pkg, avail.packages[,"Package"]); if( ! 
is.na(idx) ){; cat(pkg, avail.packages[idx,"Version"], "\n"); }else{; cat(pkg, "NOT FOUND", "\n"); i=i+1; }; }; cat("\nMissing R packages: ", i, "\n\n\n")' echo "##############################" echo "#Checking basic functionality#" diff --git a/src/uniprot.sh b/src/uniprot.sh index 35860c34..ba69351d 100755 --- a/src/uniprot.sh +++ b/src/uniprot.sh @@ -74,7 +74,7 @@ shift $((OPTIND-1)) # consider special case of eg. prokaryotes which are not supported by uniprot taxonomy if [ "$taxonomy" = "Prokaryota" ]; then folder=Prokaryota - taxonomy="(bacteria%20OR%20archaea)" + taxonomy="Bacteria,Archaea" else folder=$taxonomy fi @@ -127,12 +127,11 @@ if [ -n "$ecnumber" ]; then echo -en " ... Downloading $ec \t\t" if [ "$get_unrev" = false ]; then #swissprot - url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy true 0.9 && stat=ok || stat=err else # unreviewed - url="https://www.uniprot.org/uniref/?query=uniprot%3A(ec%3A$ec%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R ec $ec $ec.fasta $taxonomy false 0.5 && stat=ok || stat=err fi - wget -q $url -O $ec.fasta && stat=ok || stat=err # download only if file not exists newmd5=$(md5sum $ec.fasta | cut -d " " -f1) newcount=$(cat $ec.fasta | grep ">" | wc -l) [[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$ec.fasta $stat" >> ../updating.log @@ -157,11 +156,10 @@ if [ -n "$reaNames" ]; then rm -f $reaNameHash.fasta echo -en " ... 
Downloading $rea\t\t" if [ "$get_unrev" = false ]; then - url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy true 0.9 && stat=ok || stat=err else - url="https://www.uniprot.org/uniref/?query=uniprot%3A(name%3A\"$rea\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R protein_name "$rea" $reaNameHash.fasta $taxonomy false 0.5 && stat=ok || stat=err fi - wget -q "$url" -O "$reaNameHash.fasta" && stat=ok || stat=err newmd5=$(md5sum $reaNameHash.fasta | cut -d " " -f1) newcount=$(cat $reaNameHash.fasta | grep ">" | wc -l) [[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$reaNameHash.fasta $stat" >> ../updating.log @@ -177,11 +175,10 @@ if [ -n "$geneName" ]; then rm -f $geneName.fasta echo -en " ... 
Downloading $geneName\t\t" if [ "$get_unrev" = false ]; then - url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ayes)%20identity%3A$identity&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R gene "$geneName" $geneName.fasta $taxonomy true 0.9 && stat=ok || stat=err else - url="https://www.uniprot.org/uniref/?query=uniprot%3A(gene_exact%3A\"$geneName\"%20taxonomy%3A$taxonomy%20AND%20reviewed%3Ano)%20identity%3A$identity_unrev&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" + Rscript $dir/uniprot_query.R gene "$geneName" $geneName.fasta $taxonomy false 0.5 && stat=ok || stat=err fi - wget -q "$url" -O "$geneName.fasta" && stat=ok || stat=err newmd5=$(md5sum $geneName.fasta | cut -d " " -f1) newcount=$(cat $geneName.fasta | grep ">" | wc -l) [[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$geneName.fasta $stat" >> ../updating.log @@ -198,8 +195,7 @@ if [ -n "$dbref" ]; then oldcount=$(cat $dbref.fasta | grep ">" | wc -l) rm -f $dbref.fasta echo -en " ... 
Downloading $dbref\t\t" - url="https://www.uniprot.org/uniprot/?query=uniprot%3A(id%3A\"$dbref\"%20taxonomy%3A$taxonomy)&columns=id%2Creviewed%2Cname%2Ccount%2Cmembers%2Corganisms%2Clength%2Cidentity&format=fasta" - wget -q "$url" -O "$dbref.fasta" && stat=ok || stat=err + Rscript $dir/uniprot_query.R id $dbref $dbref.fasta $taxonomy && stat=ok || stat=err newmd5=$(md5sum $dbref.fasta | cut -d " " -f1) newcount=$(cat $dbref.fasta | grep ">" | wc -l) [[ "$oldmd5" != "$newmd5" ]] && echo "`date +"%d/%m/%Y"` `echo $newcount` `echo $newcount-$oldcount | bc` $seqpath/$dbref.fasta $stat" >> ../updating.log diff --git a/src/uniprot_query.R b/src/uniprot_query.R new file mode 100644 index 00000000..63bf81f6 --- /dev/null +++ b/src/uniprot_query.R @@ -0,0 +1,169 @@ +library(httr) +library(stringr) + +# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~# +# Programmatic download of protein cluster sequences from EC / Protein name / # +# Gene ID queries via Uniprot REST requires 3 steps: # +# (1) Find UniprotKB ID/AC of query-matching proteins # +# (2) Match IDs/Accessions from (1) to identity-based protein clusters # +# (UniRef_90/UniRef_50) # +# (3) Retrieve representative Sequences of protein clusters # +# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~# + +# query_type <- "ec" # (either "ec", "protein_name", "gene_exact" or "id"). +# query_term <- "1.2.1.3" +batch_size_query_results <- 500 +batch_size_clusters <- 200 # 200 should work: https://github.com/ebi-uniprot/uniprot-rest-api/issues/275#issuecomment-1173888616 +max_attempts <- 10 # Maximum number of attempts per query to receive a status==200 response from uniprot website. 
+ +GET_retries <- function(url) { + attempts <- 1 + get_success <- FALSE + while(get_success == FALSE && attempts <= max_attempts) { + res <- GET(url) + attempts <- attempts + 1 + get_success <- res$status_code == 200 + } + if(get_success == FALSE) { + quit(save = "no", status = 1) + } + + return(res) +} +# rev_status <- "true" +# uniref_id <- "0.9" +# output_fasta_file <- "test.fasta" +# taxonomy <- "Bacteria" + +# positional arguments: +# 1: query type (ec, protein_name, gene or id) +# 2: query term +# 3: output file name +# 4: taxonomy +# 5: reviewed status (true or false) +# 6: UniRef identity clustering cutoff (0.9 or 0.5) + +args = commandArgs(trailingOnly=TRUE) + +# parse positional arguments +query_type <- args[1] +query_term <- args[2] +output_fasta_file <- args[3] +taxonomy <- args[4] +if(length(args)==6) { + rev_status <- args[5] + uniref_id <- args[6] +} + +# check if all required arguments as provided +if (query_type!="id" & length(args)!=6) { + stop("") +} +if (query_type=="id" & length(args)!=4) { + stop("") +} + + +if(query_type != "id") { + # parse taxonomy term + taxonomy <- unlist(strsplit(taxonomy, ",")) + taxonomy <- paste0("(taxonomy_name:",taxonomy,")", collapse = "%20OR%20") + taxonomy <- paste0("(",taxonomy,")") + + # (0) - Build uniprot entry URL + urli <- paste0("https://rest.uniprot.org/uniprotkb/search?format=list&query=(", + query_type,":",query_term, + ")%20AND%20",taxonomy,"%20AND%20(reviewed:", + rev_status,")&size=", + batch_size_query_results) + + # (1) - Find uniprot accession numbers of matching entries + accessions <- c() + total <- NA_integer_ + # cat("Retrieving sequence accession IDs ...\n") + while(length(urli)==1) { + ri <- GET_retries(urli) + ri_header <- headers(ri) + + # get metrics and accessions + total <- as.numeric(ri_header$`x-total-results`) + accessions <- c(accessions, unlist(strsplit(httr::content(ri, as = "text", encoding = "UTF-8"),"\n"))) + cat("\r\t\t\t",length(accessions), "/", total) + + ind_next <- 
grep("\"next\"$",ri_header$link) + + next_url <- str_match(ri_header$link[ind_next], "<\\s*(.*?)\\s*>")[,2] + + urli <- next_url + } + cat("\n") + if(length(accessions) == 0) { + cat(NULL, file = output_fasta_file) + quit(save = "no", status = 0) + } + + + # (2) retrieve corresponding uniref90 cluster IDs (not sequences) + batch_ids <- floor(1:length(accessions)/batch_size_clusters) + cluster_ids <- c() + k <- 0 + + for(batchi in unique(batch_ids)) { + k <- k + sum(batch_ids == batchi) + cat("\r\t\t\t",k,"/",total) + acc_concat <- paste0("%28uniprot_id%3A",accessions[batch_ids == batchi],"%29", + collapse = "%20OR%20") + urlc <- paste0("https://rest.uniprot.org/uniref/search?format=list&query=%28%28", + acc_concat,"%29%20AND%20%28identity%3A",uniref_id,"%29%29", + "&size=", batch_size_query_results) + + ri <- GET_retries(urlc) + cluster_ids <- c(cluster_ids, + unlist(strsplit(httr::content(ri, as = "text", + encoding = "UTF-8"),"\n"))) + } + cluster_ids <- unique(cluster_ids) + if(length(cluster_ids)==0) + quit(save = "no", status = 1) + cat(" (",length(cluster_ids),"cluster sequences found)\n") + total_uniref <- length(cluster_ids) + + # (3) retrieve Uniref sequences + batch_ids <- floor(1:length(cluster_ids)/batch_size_clusters) + seqs <- c() + k <- 0 + + for(batchi in unique(batch_ids)) { + k <- k + sum(batch_ids == batchi) + cat("\r\t\t\t",k,"/",total_uniref) + uniref_acc_concat <- paste0("id%3A",cluster_ids[batch_ids == batchi], + collapse = "%20OR%20") + + urls <- paste0("https://rest.uniprot.org/uniref/search?compressed=false&format=fasta&query=", + uniref_acc_concat,"&size=", batch_size_query_results) + + ri <- GET_retries(urls) + + seqs <- c(seqs, httr::content(ri, as = "text", encoding = "UTF-8")) + } + cat("\n") + if(length(seqs)==0) + quit(save = "no", status = 1) + + cat(seqs, sep = "", file = output_fasta_file) + quit(save = "no", status = 0) +} + + +if(query_type == "id") { + # (0) - Build uniprot entry URL + urli <- 
paste0("https://rest.uniprot.org/uniprotkb/search?compressed=false&format=fasta&query=%28%28accession%3A", + query_term,"%29%20AND%20%28", + paste0("%28taxonomy_name%3A", unlist(strsplit(taxonomy, ",")), "%29", collapse = "%20OR%20"), + "%29%29") + + ri <- GET_retries(urli) + + seqi <- httr::content(ri, as = "text", encoding = "UTF-8") + cat(seqi, sep = "", file = output_fasta_file) + quit(save = "no", status = 0) +} +