Merge pull request #172 from replikation/revision_gigascience

Revision gigascience
replikation · Aug 17, 2022 · 79faf1a · 79faf1a
2 parents 928cd60 + 3db94ce
commit 79faf1a
Show file tree

Hide file tree

Showing 12 changed files with 118 additions and 74 deletions.
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 
 * by Christian Brandt & Mike Marquet
 * **this tool is under active development,feel free to report issues and add suggestions**
-* Use a release candidate for a stable experience via `-r` e.g. `-r v1.1.0`
+* Use a release candidate for a stable experience via `-r` e.g. `-r v1.2.0`
   * These are extensively tested release versions of WtP
   * [releases of WtP are listed here](https://github.com/replikation/What_the_Phage/releases)  
 

diff --git a/bin/contig_by_tool_count.sh b/bin/contig_by_tool_count.sh
@@ -8,7 +8,15 @@ done
 awk '{print $0"\t"FILENAME}' *.txt >  tmp_result.tsv
 awk '{ gsub(/.txt/,"", $3); print }' OFS='\t' tmp_result.tsv > tmp_results2.tsv
 rm *.txt
-sed -e '1i\contig_name\tp_value\ttoolname' tmp_results2.tsv > contig_tool_p-value_overview.tsv
+sed -e '1i\contig_name\tp_value\ttoolname' tmp_results2.tsv > tmp_results3.tsv
+
+## if r markdown code breaks .... these are the files can generate the error 
+## error prevention
+grep -E -v ".txt" tmp_results3.tsv > contig_tool_p-value_overview.tsv
+
+
+
+
 
 ## which tools were used?
 #cut -f3 tmp_results2.tsv | sort -u > tools_used_for_phage_prediction.tsv

diff --git a/nextflow.config b/nextflow.config
@@ -15,6 +15,8 @@ params {
     cloudDatabase = false
     filter = '1500'
     setup = ''
+    all_tools = false
+    annotation_db = false
 
     // folder structure
     output = 'results'
@@ -46,18 +48,14 @@ params {
     // parameter
     hmm_params = '-E 1e-30'
 
-     // raw tool output filters
-
+    // raw tool output filters  
     dv_filter = '0.9'
-    // ma_filter = '75'
     mp_filter = '50'
     vf_filter = '0.9'
     sm_filter = '0.5'
     vn_filter = '0.5'
     vs2_filter = '0.9'
 	sk_filter = '0.75'
-    // pp_filter = ''
-    // vb_filter = ''
 
 }
 // runinfo

diff --git a/phage.nf b/phage.nf
@@ -171,11 +171,37 @@ workflow {
     else if (params.fasta && !params.identify && !params.annotate && !params.setup ) { prediction_channel =  input_validation_wf(fasta_input_ch) }
 
 /************************** 
-* Prediction
+* Prediction via benchmarked tools only
 **************************/
     // run annotation if identify flag or no flag at all
-    if (params.fasta && params.identify && !params.annotate && !params.setup || params.fasta && !params.identify && !params.annotate && !params.setup )  { 
+    if (params.fasta && params.identify && !params.annotate && !params.setup && !params.all_tools || params.fasta  && !params.identify && !params.annotate && !params.setup && !params.all_tools )  { 
     // actual tools     
+        results = deepvirfinder_wf( prediction_channel)
+                .concat( seeker_wf(prediction_channel))
+                .concat( virfinder_wf(prediction_channel))
+                .concat( pprmeta_wf(prediction_channel))
+                .concat( metaphinder_wf(prediction_channel))
+                .concat( vibrant_wf(prediction_channel))
+                .concat( vibrant_virome_wf(prediction_channel))
+                .concat( virsorter_wf(prediction_channel))
+                .concat( virsorter_virome_wf(prediction_channel))
+                .concat( virsorter2_wf(prediction_channel))
+                .filter { it != 'deactivated' } // removes deactivated tool channels
+                .groupTuple()
+
+        prepare_results_wf(results, prediction_channel)
+
+        // markdown report input
+        // map identify output for input of annotaion tools
+        annotation_channel = input_validation_wf.out.join(results)
+    }
+    //&& !params.all_tools
+/************************** 
+* Prediction via all tools
+**************************/
+    // run annotation if identify flag or no flag at all
+    if (params.fasta  && params.identify && !params.annotate && !params.setup && params.all_tools|| params.fasta && params.all_tools && !params.identify && !params.annotate && !params.setup )  { 
+    // benchmarked tools     
         results = deepvirfinder_wf( prediction_channel)
                 .concat( phigaro_wf(prediction_channel))
                 .concat( seeker_wf(prediction_channel))
@@ -199,7 +225,7 @@ workflow {
         // map identify output for input of annotaion tools
         annotation_channel = input_validation_wf.out.join(results)
     }
-    
+
 /************************** 
 * Annotation
 **************************/
@@ -256,10 +282,11 @@ def helpMSG() {
     c_reset = "\033[0m";
     c_yellow = "\033[0;33m";
     c_blue = "\033[0;34m";
+    c_purple = "\033[0;35m";
     c_dim = "\033[2m";
     log.info """
     .
-    ${c_yellow}Usage examples:${c_reset}
+    ${c_purple}Usage examples:${c_reset}
     nextflow run replikation/What_the_Phage --fasta '*/*.fasta' --cores 20 --max_cores 40 \\
         --output results -profile local,docker 
 
@@ -268,15 +295,15 @@ def helpMSG() {
         --cachedir /images/singularity_images \\
         --databases /databases/WtP_databases/ 
 
-    ${c_yellow}Input:${c_reset}
+    ${c_purple}Input:${c_reset}
      --fasta             '*.fasta'   -> assembly file(s)
      --fastq             '*.fastq'   -> long read file(s)
     ${c_dim}  ..change above input to csv via --list ${c_reset}  
     ${c_dim}   e.g. --fasta inputs.csv --list    
         the .csv contains per line: name,/path/to/file${c_reset}  
      --setup              skips analysis and just downloads databases and containers
 
-    ${c_yellow}Execution/Engine profiles:${c_reset}
+    ${c_purple}Execution/Engine profiles:${c_reset}
      WtP supports profiles to run via different ${c_green}Executers${c_reset} and ${c_blue}Engines${c_reset} e.g.:
      -profile ${c_green}local${c_reset},${c_blue}docker${c_reset}
 
@@ -291,13 +318,16 @@ def helpMSG() {
     
     For a test run (~ 1h), add "smalltest" to the profile, e.g. -profile smalltest,local,singularity 
     
-    ${c_yellow}Options:${c_reset}
+    ${c_purple}Options:${c_reset}
     --filter            min contig size [bp] to analyse [default: $params.filter]
     --cores             max cores per process for local use [default: $params.cores]
     --max_cores         max cores used on the machine for local use [default: $params.max_cores]    
     --output            name of the result folder [default: $params.output]
 
-    ${c_yellow}Tool control:${c_reset}
+    ${c_purple}Tool control:${c_reset}
+    Deploy all integrated phage prediction tools
+    --all_tools         activate all phage prediction tools     
+
     Deactivate tools individually by adding one or more of these flags
     --dv                deactivates deepvirfinder
     --mp                deactivates metaphinder
@@ -311,8 +341,12 @@ def helpMSG() {
     --vs2               deactivates virsorter2
     --sk                deactivates seeker
 
+    ${c_purple}Custom phage annotation Database:${c_reset}
+    --annotation_db     /path/to/your/custom_phage_annotation_db.tar.gz
+                        Please provide a custom_phage_annotation_db.tar.gz archive that contains the following file formats:
+                        *.hmm  *.hmm.h3f  *.hmm.h3i  *.hmm.h3m  *.hmm.h3p
 
-    Workflow control:
+    ${c_yellow}Workflow control:${c_reset}
     --identify          only phage identification, skips analysis
     --annotate          only annotation, skips phage identification
 

diff --git a/submodule_report/Heatmap_table.Rmd b/submodule_report/Heatmap_table.Rmd
@@ -60,6 +60,26 @@ The tool's output and what WtP assigns are shown in the table below.
 
 </div>
 
+#### **Explanation tool output**
+
+**Tab.2**: The output of each tool and the values WtP assigns in Tab.1 .
+
+Tool            | Standard output                     | WtP displayed value | F1 scores by Ho et al.
+|-|-|-|-|
+|deepvirfinder  | p-value: 0 to 1                      | 0 to 1      | >0.83
+|metaphinder    | string: phage                        | 1           | >0.83
+|metaphinder own| string: phage                        | 1           | N/A
+|phigaro        | score: 0 to 1                        | 0 to 1      | N/A
+|pprmeta        | phage_score: 0 to 1                  | 0 to 1      | 0.92
+|seeker         | score: 0 to 1                        | 0 to 1      | <0.5
+|sourmash       | similarity: 0 to 1                   | 0 to 1      | N/A
+|vibrant        | prediction: virus                    | 1           | >0.83
+|vibrant-virome | prediction: virus                    | 1           | N/A
+|virfinder      | p-value: 0 to 1                      | 0 to 1      | >0.83
+|virnet         | score: 0 to 1                        | 0 to 1      | N/A
+|virsorter      | category 1, category 2, category 3   | 1, 0.5, 0   | >0.83
+|virsorter-virome|category 1, category 2, category 3   | 1, 0.5, 0   | N/A
+|virsorter2     | dsDNAphage: 0 to 1                   | 0 to 1      | 0.93
 
 
 #### **Extract contigs of interest**
@@ -89,25 +109,5 @@ seqkit grep --pattern-file contig_IDs_of_interest.txt your_input_fasta.fa.gz > c
 
 
 
-#### **Explanation tool output**
-
-**Tab.2**: The output of each tool and the values WtP assigns in Tab.1 .
-
-Tool | Standard output | WtP displayed value | 
-|-|-|-|
-|deepvirfinder  |p-value: 0 to 1          | 0 to 1  |
-|metaphinder    |string: phage          | 1     |
-|metaphinder own| string: phage         | 1     |
-|phigaro        | score: 0 to 1           | 0 to 1  |
-|pprmeta        |phage_score: 0 to 1      | 0 to 1  |
-|seeker         |score: 0 to 1            | 0 to 1  |
-|sourmash       |similarity: 0 to 1       | 0 to 1  |
-|vibrant        |prediction: virus      | 1     |
-|vibrant-virome |prediction: virus      | 1     |
-|virfinder      |p-value: 0 to 1         | 0 to 1  |
-|virnet         |score: 0 to 1           | 0 to 1  |
-|virsorter      |category 1, category 2, category 3| 1, 0.5, 0  |
-|virsorter-virome|category 1, category 2, category 3| 1, 0.5, 0  |
-|virsorter2     |dsDNAphage: 0 to 1          | 0 to 1     |
 
 <a href="#top">Back to top</a>
diff --git a/submodule_report/UpsetR.Rmd b/submodule_report/UpsetR.Rmd
@@ -24,10 +24,11 @@ div.blue { background-color:#e6f0ff; border-radius: 5px; padding: 20px;}
 </style>
 <div class = "blue">
 
-The "UpSet" plot is an alternative but more detailed Venn diagram, and it summarizes the prediction performance of each tool for your sample.  
-The total amount of identified phage-contigs per tool is shown in blue bars on the left.  
-Black bars visualize the number of contigs that each tool or tool combination has uniquely identified, and each tool combination is shown below each black bar as a dot matrix. 
-E.g., a black bar with two dots below it means that these two tools identified the same contigs as phages.
+The total amount of identified phage contigs per tool is shown in blue bars on the left.  
+Black, vertical bars visualize the number of contigs that each tool or tool combination has uniquely identified.  
+Each tool combination is shown below the vertical barplot as a dot matrix. How to read the diagram: For example, 53 phage contigs are found by six tools (DeepVirFinder, Metaphinder-own-DB, Metaphinder, PPRmeta, Seeker and VirFinder).  
+Another 42 contigs are found by these tools but also virnet.
+
 </div>
 
 

diff --git a/workflows/phage_annotation_wf.nf b/workflows/phage_annotation_wf.nf
@@ -8,7 +8,11 @@ include { chromomap } from './process/phage_annotation/chromomap'
 workflow phage_annotation_wf {
     take:   fasta_and_tool_results
     main:
-
+           // Input for custom annotation database
+           if (params.annotation_db) { annotation_custom_db_ch = Channel
+                                            .fromPath( params.annotation_db, checkIfExists: true)
+                                            .view()}
+            // map input for prodigal
             fasta = fasta_and_tool_results.map {it -> tuple(it[0],it[1])}
             //Databases 
             //Database for hmmscan
@@ -31,7 +35,8 @@ workflow phage_annotation_wf {
             }
             //annotation-process
             prodigal(fasta)
-            hmmscan(prodigal.out, pvog_DB.out)            
+            if (!params.annotation_db) {hmmscan(prodigal.out, pvog_DB.out)}
+            else {hmmscan(prodigal.out, annotation_custom_db_ch)}         
             chromomap_parser(fasta.join(hmmscan.out), vogtable_DB.out)
             chromomap(chromomap_parser.out[0].mix(chromomap_parser.out[1]))
 

diff --git a/workflows/process/metaphinder/metaphinder.nf b/workflows/process/metaphinder/metaphinder.nf
@@ -23,29 +23,4 @@ process metaphinder {
 }
 
 
-process metaphinder_own_DB {
-    label 'metaphinder'
-    errorStrategy 'ignore'
-    input:
-        tuple val(name), file(fasta)
-        file(database)
-    output:
-        tuple val(name), file("${name}_*.list")
-        // output collection stream
-        tuple val(name), file("${name}_*.list"), file("${name}_*_blast.out")
-    script:
-        """
-        rnd=${Math.random()}
-        mkdir ${name}
-        tar xzf ${database}
-        MetaPhinder.py -i ${fasta} -o ${name} -d phage_blast_DB/phage_db
-        mv ${name}/output.txt ${name}_\${PWD##*/}.list
-        mv ${name}/blast.out ${name}_\${PWD##*/}_blast.out
-        """
-    stub:
-        """
-        echo "#contigID       classification  ANI [%] merged coverage [%]     number of hits  size[bp]" > ${name}_\${PWD##*/}.list
-        echo "pos_phage_0     phage   80.754  96.97   172     146647" >> ${name}_\${PWD##*/}.list
-        touch ${name}_\${PWD##*/}_blast.out
-        """
-}
+
diff --git a/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf b/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf
@@ -3,7 +3,7 @@ process phage_references_blastDB {
     else { storeDir "${params.databases}/blast_DB_phage" }
     label 'metaphinder'
     errorStrategy = "retry"
-    maxRetries = 1
+    maxRetries = 2
     input:
         path(references)
     output:

diff --git a/workflows/process/phage_annotation/hmmscan.nf b/workflows/process/phage_annotation/hmmscan.nf
@@ -3,16 +3,24 @@ process hmmscan {
         label 'hmmscan'
     input:
         tuple val(name), path(faa) 
-        path(pvog_db)
+        path(annotation_db)
     output:
-        tuple val(name), path("${name}_${pvog_db}_hmmscan.tbl"), path(faa) 
+        tuple val(name), path("${name}_${annotation_db}_hmmscan.tbl"), path(faa) 
     script:
+        if (!params.annotation_db)
         """
-        hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${pvog_db}_hmmscan.tbl ${pvog_db}/${pvog_db}.hmm ${faa}
+        hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${annotation_db}_hmmscan.tbl ${annotation_db}/${annotation_db}.hmm ${faa}
         """
+        else
+        """
+        mkdir custom_annotation_db
+        tar -xvzf ${annotation_db} -C custom_annotation_db
+        hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${annotation_db}_hmmscan.tbl custom_annotation_db/*.hmm ${faa}
+        """
+
     stub:
         """
-        touch ${name}_${pvog_db}_hmmscan.tbl
+        touch ${name}_${annotation_db}_hmmscan.tbl
         """
 }
 

diff --git a/workflows/process/prepare_results/upsetr.nf b/workflows/process/prepare_results/upsetr.nf
@@ -32,7 +32,7 @@ process upsetr_plot {
 
       #nsets = 20, nintersects = 40
       upset(fromList(sets), sets = names(sets),
-          mainbar.y.label = "No. of shared viral contigs", sets.x.label = "Number of identified \\nviral contigs", 
+          mainbar.y.label = "No. of shared phage contigs", sets.x.label = "Number of phage \\ncontigs per tool ", 
           order.by = "freq", sets.bar.color = "#56B4E9", keep.order = F, 
           text.scale = 1.4, point.size = 2.6, line.size = 0.8, set_size.show = FALSE)
 

diff --git a/workflows/process/virsorter2/filter_virsorter2.nf b/workflows/process/virsorter2/filter_virsorter2.nf
@@ -7,6 +7,7 @@ process filter_virsorter2 {
     script:
         """
         tail -n+2 *.tsv |  awk '{ print \$1, \$2 }' OFS='\\t' | awk '{ print \$1, \$2 }' OFS='||' | cut -d "|" -f1,5 | awk -F"|" '{ print \$1, \$2 }' OFS='\t' > virsorter2_\${PWD##*/}.tsv
+        sed -i 's/nan/0/g' virsorter2_\${PWD##*/}.tsv
         """
 }
 
@@ -19,3 +20,17 @@ ctg6_len=5303||full     0.967   0.733   0.967   dsDNAphage      3585    1
 
 //awk '{ gsub(/||full/,"", $3); print }' OFS='\t' tmp_result.tsv > tmp_results2.tsv
 //awk '{ print \$1, \$2 }' OFS='||' virsorter2_\${PWD##*/}.tsv
+
+// nan  if row contains nan make to 0
+// GT1_27252	0.840	virsorter2
+// GT1_27276	0.640	virsorter2
+// GT1_27283	0.560	virsorter2
+// GT1_27330	0.633	virsorter2
+// GT1_27380	0.180	virsorter2
+// GT1_1	0.840	virsorter2
+// GT1_396	0.567	virsorter2
+// GT1_2577	nan	virsorter2
+// GT1_5431	nan	virsorter2
+// GT1_5765	nan	virsorter2
+// GT1_7346	nan	virsorter2
+//sed -i 's/nan/0/g' virsorter2_test_nan.txt