diff --git a/README.md b/README.md index dd50714..c54289f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ * by Christian Brandt & Mike Marquet * **this tool is under active development,feel free to report issues and add suggestions** -* Use a release candidate for a stable experience via `-r` e.g. `-r v1.1.0` +* Use a release candidate for a stable experience via `-r` e.g. `-r v1.2.0` * These are extensively tested release versions of WtP * [releases of WtP are listed here](https://github.com/replikation/What_the_Phage/releases) diff --git a/bin/contig_by_tool_count.sh b/bin/contig_by_tool_count.sh index fd0b5f8..da9de11 100755 --- a/bin/contig_by_tool_count.sh +++ b/bin/contig_by_tool_count.sh @@ -8,7 +8,15 @@ done awk '{print $0"\t"FILENAME}' *.txt > tmp_result.tsv awk '{ gsub(/.txt/,"", $3); print }' OFS='\t' tmp_result.tsv > tmp_results2.tsv rm *.txt -sed -e '1i\contig_name\tp_value\ttoolname' tmp_results2.tsv > contig_tool_p-value_overview.tsv +sed -e '1i\contig_name\tp_value\ttoolname' tmp_results2.tsv > tmp_results3.tsv + +## if r markdown code breaks .... these are the files can generate the error +## error prevention +grep -E -v ".txt" tmp_results3.tsv > contig_tool_p-value_overview.tsv + + + + ## which tools were used? #cut -f3 tmp_results2.tsv | sort -u > tools_used_for_phage_prediction.tsv diff --git a/nextflow.config b/nextflow.config index 160039f..171a321 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,8 @@ params { cloudDatabase = false filter = '1500' setup = '' + all_tools = false + annotation_db = false // folder structure output = 'results' @@ -46,18 +48,14 @@ params { // parameter hmm_params = '-E 1e-30' - // raw tool output filters - + // raw tool output filters dv_filter = '0.9' - // ma_filter = '75' mp_filter = '50' vf_filter = '0.9' sm_filter = '0.5' vn_filter = '0.5' vs2_filter = '0.9' sk_filter = '0.75' - // pp_filter = '' - // vb_filter = '' } // runinfo diff --git a/phage.nf b/phage.nf index ee9e722..b2c1441 100755 --- a/phage.nf +++ b/phage.nf @@ -171,11 +171,37 @@ workflow { else if (params.fasta && !params.identify && !params.annotate && !params.setup ) { prediction_channel = input_validation_wf(fasta_input_ch) } /************************** -* Prediction +* Prediction via benchmarked tools only **************************/ // run annotation if identify flag or no flag at all - if (params.fasta && params.identify && !params.annotate && !params.setup || params.fasta && !params.identify && !params.annotate && !params.setup ) { + if (params.fasta && params.identify && !params.annotate && !params.setup && !params.all_tools || params.fasta && !params.identify && !params.annotate && !params.setup && !params.all_tools ) { // actual tools + results = deepvirfinder_wf( prediction_channel) + .concat( seeker_wf(prediction_channel)) + .concat( virfinder_wf(prediction_channel)) + .concat( pprmeta_wf(prediction_channel)) + .concat( metaphinder_wf(prediction_channel)) + .concat( vibrant_wf(prediction_channel)) + .concat( vibrant_virome_wf(prediction_channel)) + .concat( virsorter_wf(prediction_channel)) + .concat( virsorter_virome_wf(prediction_channel)) + .concat( virsorter2_wf(prediction_channel)) + .filter { it != 'deactivated' } // removes deactivated tool channels + .groupTuple() + + prepare_results_wf(results, prediction_channel) + + // markdown report input + // map identify output for input of annotaion tools + annotation_channel = input_validation_wf.out.join(results) + } + //&& !params.all_tools +/************************** +* Prediction via all tools +**************************/ + // run annotation if identify flag or no flag at all + if (params.fasta && params.identify && !params.annotate && !params.setup && params.all_tools|| params.fasta && params.all_tools && !params.identify && !params.annotate && !params.setup ) { + // benchmarked tools results = deepvirfinder_wf( prediction_channel) .concat( phigaro_wf(prediction_channel)) .concat( seeker_wf(prediction_channel)) @@ -199,7 +225,7 @@ workflow { // map identify output for input of annotaion tools annotation_channel = input_validation_wf.out.join(results) } - + /************************** * Annotation **************************/ @@ -256,10 +282,11 @@ def helpMSG() { c_reset = "\033[0m"; c_yellow = "\033[0;33m"; c_blue = "\033[0;34m"; + c_purple = "\033[0;35m"; c_dim = "\033[2m"; log.info """ . - ${c_yellow}Usage examples:${c_reset} + ${c_purple}Usage examples:${c_reset} nextflow run replikation/What_the_Phage --fasta '*/*.fasta' --cores 20 --max_cores 40 \\ --output results -profile local,docker @@ -268,7 +295,7 @@ def helpMSG() { --cachedir /images/singularity_images \\ --databases /databases/WtP_databases/ - ${c_yellow}Input:${c_reset} + ${c_purple}Input:${c_reset} --fasta '*.fasta' -> assembly file(s) --fastq '*.fastq' -> long read file(s) ${c_dim} ..change above input to csv via --list ${c_reset} @@ -276,7 +303,7 @@ def helpMSG() { the .csv contains per line: name,/path/to/file${c_reset} --setup skips analysis and just downloads databases and containers - ${c_yellow}Execution/Engine profiles:${c_reset} + ${c_purple}Execution/Engine profiles:${c_reset} WtP supports profiles to run via different ${c_green}Executers${c_reset} and ${c_blue}Engines${c_reset} e.g.: -profile ${c_green}local${c_reset},${c_blue}docker${c_reset} @@ -291,13 +318,16 @@ def helpMSG() { For a test run (~ 1h), add "smalltest" to the profile, e.g. -profile smalltest,local,singularity - ${c_yellow}Options:${c_reset} + ${c_purple}Options:${c_reset} --filter min contig size [bp] to analyse [default: $params.filter] --cores max cores per process for local use [default: $params.cores] --max_cores max cores used on the machine for local use [default: $params.max_cores] --output name of the result folder [default: $params.output] - ${c_yellow}Tool control:${c_reset} + ${c_purple}Tool control:${c_reset} + Deploy all integrated phage prediction tools + --all_tools activate all phage prediction tools + Deactivate tools individually by adding one or more of these flags --dv deactivates deepvirfinder --mp deactivates metaphinder @@ -311,8 +341,12 @@ def helpMSG() { --vs2 deactivates virsorter2 --sk deactivates seeker + ${c_purple}Custom phage annotation Database:${c_reset} + --annotation_db /path/to/your/custom_phage_annotation_db.tar.gz + Please provide a custom_phage_annotation_db.tar.gz archive that contains the following file formats: + *.hmm *.hmm.h3f *.hmm.h3i *.hmm.h3m *.hmm.h3p - Workflow control: + ${c_yellow}Workflow control:${c_reset} --identify only phage identification, skips analysis --annotate only annotation, skips phage identification diff --git a/submodule_report/Heatmap_table.Rmd b/submodule_report/Heatmap_table.Rmd index ad1f88c..58812ed 100644 --- a/submodule_report/Heatmap_table.Rmd +++ b/submodule_report/Heatmap_table.Rmd @@ -60,6 +60,26 @@ The tool's output and what WtP assigns are shown in the table below. +#### **Explanation tool output** + +**Tab.2**: The output of each tool and the values WtP assigns in Tab.1 . + +Tool | Standard output | WtP displayed value | F1 scores by Ho et al. +|-|-|-|-| +|deepvirfinder | p-value: 0 to 1 | 0 to 1 | >0.83 +|metaphinder | string: phage | 1 | >0.83 +|metaphinder own| string: phage | 1 | N/A +|phigaro | score: 0 to 1 | 0 to 1 | N/A +|pprmeta | phage_score: 0 to 1 | 0 to 1 | 0.92 +|seeker | score: 0 to 1 | 0 to 1 | <0.5 +|sourmash | similarity: 0 to 1 | 0 to 1 | N/A +|vibrant | prediction: virus | 1 | >0.83 +|vibrant-virome | prediction: virus | 1 | N/A +|virfinder | p-value: 0 to 1 | 0 to 1 | >0.83 +|virnet | score: 0 to 1 | 0 to 1 | N/A +|virsorter | category 1, category 2, category 3 | 1, 0.5, 0 | >0.83 +|virsorter-virome|category 1, category 2, category 3 | 1, 0.5, 0 | N/A +|virsorter2 | dsDNAphage: 0 to 1 | 0 to 1 | 0.93 #### **Extract contigs of interest** @@ -89,25 +109,5 @@ seqkit grep --pattern-file contig_IDs_of_interest.txt your_input_fasta.fa.gz > c -#### **Explanation tool output** - -**Tab.2**: The output of each tool and the values WtP assigns in Tab.1 . - -Tool | Standard output | WtP displayed value | -|-|-|-| -|deepvirfinder |p-value: 0 to 1 | 0 to 1 | -|metaphinder |string: phage | 1 | -|metaphinder own| string: phage | 1 | -|phigaro | score: 0 to 1 | 0 to 1 | -|pprmeta |phage_score: 0 to 1 | 0 to 1 | -|seeker |score: 0 to 1 | 0 to 1 | -|sourmash |similarity: 0 to 1 | 0 to 1 | -|vibrant |prediction: virus | 1 | -|vibrant-virome |prediction: virus | 1 | -|virfinder |p-value: 0 to 1 | 0 to 1 | -|virnet |score: 0 to 1 | 0 to 1 | -|virsorter |category 1, category 2, category 3| 1, 0.5, 0 | -|virsorter-virome|category 1, category 2, category 3| 1, 0.5, 0 | -|virsorter2 |dsDNAphage: 0 to 1 | 0 to 1 | Back to top \ No newline at end of file diff --git a/submodule_report/UpsetR.Rmd b/submodule_report/UpsetR.Rmd index 8088703..444973f 100644 --- a/submodule_report/UpsetR.Rmd +++ b/submodule_report/UpsetR.Rmd @@ -24,10 +24,11 @@ div.blue { background-color:#e6f0ff; border-radius: 5px; padding: 20px;}
-The "UpSet" plot is an alternative but more detailed Venn diagram, and it summarizes the prediction performance of each tool for your sample. -The total amount of identified phage-contigs per tool is shown in blue bars on the left. -Black bars visualize the number of contigs that each tool or tool combination has uniquely identified, and each tool combination is shown below each black bar as a dot matrix. -E.g., a black bar with two dots below it means that these two tools identified the same contigs as phages. +The total amount of identified phage contigs per tool is shown in blue bars on the left. +Black, vertical bars visualize the number of contigs that each tool or tool combination has uniquely identified. +Each tool combination is shown below the vertical barplot as a dot matrix. How to read the diagram: For example, 53 phage contigs are found by six tools (DeepVirFinder, Metaphinder-own-DB, Metaphinder, PPRmeta, Seeker and VirFinder). +Another 42 contigs are found by these tools but also virnet. +
diff --git a/workflows/phage_annotation_wf.nf b/workflows/phage_annotation_wf.nf index baaf397..6dff716 100644 --- a/workflows/phage_annotation_wf.nf +++ b/workflows/phage_annotation_wf.nf @@ -8,7 +8,11 @@ include { chromomap } from './process/phage_annotation/chromomap' workflow phage_annotation_wf { take: fasta_and_tool_results main: - + // Input for custom annotation database + if (params.annotation_db) { annotation_custom_db_ch = Channel + .fromPath( params.annotation_db, checkIfExists: true) + .view()} + // map input for prodigal fasta = fasta_and_tool_results.map {it -> tuple(it[0],it[1])} //Databases //Database for hmmscan @@ -31,7 +35,8 @@ workflow phage_annotation_wf { } //annotation-process prodigal(fasta) - hmmscan(prodigal.out, pvog_DB.out) + if (!params.annotation_db) {hmmscan(prodigal.out, pvog_DB.out)} + else {hmmscan(prodigal.out, annotation_custom_db_ch)} chromomap_parser(fasta.join(hmmscan.out), vogtable_DB.out) chromomap(chromomap_parser.out[0].mix(chromomap_parser.out[1])) diff --git a/workflows/process/metaphinder/metaphinder.nf b/workflows/process/metaphinder/metaphinder.nf index 1d4c582..d002303 100644 --- a/workflows/process/metaphinder/metaphinder.nf +++ b/workflows/process/metaphinder/metaphinder.nf @@ -23,29 +23,4 @@ process metaphinder { } -process metaphinder_own_DB { - label 'metaphinder' - errorStrategy 'ignore' - input: - tuple val(name), file(fasta) - file(database) - output: - tuple val(name), file("${name}_*.list") - // output collection stream - tuple val(name), file("${name}_*.list"), file("${name}_*_blast.out") - script: - """ - rnd=${Math.random()} - mkdir ${name} - tar xzf ${database} - MetaPhinder.py -i ${fasta} -o ${name} -d phage_blast_DB/phage_db - mv ${name}/output.txt ${name}_\${PWD##*/}.list - mv ${name}/blast.out ${name}_\${PWD##*/}_blast.out - """ - stub: - """ - echo "#contigID classification ANI [%] merged coverage [%] number of hits size[bp]" > ${name}_\${PWD##*/}.list - echo "pos_phage_0 phage 80.754 96.97 172 146647" >> ${name}_\${PWD##*/}.list - touch ${name}_\${PWD##*/}_blast.out - """ -} + diff --git a/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf b/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf index 46b5cc0..e3a837e 100644 --- a/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf +++ b/workflows/process/metaphinder_own_DB/phage_references_blastDB.nf @@ -3,7 +3,7 @@ process phage_references_blastDB { else { storeDir "${params.databases}/blast_DB_phage" } label 'metaphinder' errorStrategy = "retry" - maxRetries = 1 + maxRetries = 2 input: path(references) output: diff --git a/workflows/process/phage_annotation/hmmscan.nf b/workflows/process/phage_annotation/hmmscan.nf index 24e8106..474823f 100644 --- a/workflows/process/phage_annotation/hmmscan.nf +++ b/workflows/process/phage_annotation/hmmscan.nf @@ -3,16 +3,24 @@ process hmmscan { label 'hmmscan' input: tuple val(name), path(faa) - path(pvog_db) + path(annotation_db) output: - tuple val(name), path("${name}_${pvog_db}_hmmscan.tbl"), path(faa) + tuple val(name), path("${name}_${annotation_db}_hmmscan.tbl"), path(faa) script: + if (!params.annotation_db) """ - hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${pvog_db}_hmmscan.tbl ${pvog_db}/${pvog_db}.hmm ${faa} + hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${annotation_db}_hmmscan.tbl ${annotation_db}/${annotation_db}.hmm ${faa} """ + else + """ + mkdir custom_annotation_db + tar -xvzf ${annotation_db} -C custom_annotation_db + hmmscan --cpu ${task.cpus} ${params.hmm_params} --noali --domtblout ${name}_${annotation_db}_hmmscan.tbl custom_annotation_db/*.hmm ${faa} + """ + stub: """ - touch ${name}_${pvog_db}_hmmscan.tbl + touch ${name}_${annotation_db}_hmmscan.tbl """ } diff --git a/workflows/process/prepare_results/upsetr.nf b/workflows/process/prepare_results/upsetr.nf index 0bede7a..ba0a50a 100644 --- a/workflows/process/prepare_results/upsetr.nf +++ b/workflows/process/prepare_results/upsetr.nf @@ -32,7 +32,7 @@ process upsetr_plot { #nsets = 20, nintersects = 40 upset(fromList(sets), sets = names(sets), - mainbar.y.label = "No. of shared viral contigs", sets.x.label = "Number of identified \\nviral contigs", + mainbar.y.label = "No. of shared phage contigs", sets.x.label = "Number of phage \\ncontigs per tool ", order.by = "freq", sets.bar.color = "#56B4E9", keep.order = F, text.scale = 1.4, point.size = 2.6, line.size = 0.8, set_size.show = FALSE) diff --git a/workflows/process/virsorter2/filter_virsorter2.nf b/workflows/process/virsorter2/filter_virsorter2.nf index f8c31ac..6c88a01 100644 --- a/workflows/process/virsorter2/filter_virsorter2.nf +++ b/workflows/process/virsorter2/filter_virsorter2.nf @@ -7,6 +7,7 @@ process filter_virsorter2 { script: """ tail -n+2 *.tsv | awk '{ print \$1, \$2 }' OFS='\\t' | awk '{ print \$1, \$2 }' OFS='||' | cut -d "|" -f1,5 | awk -F"|" '{ print \$1, \$2 }' OFS='\t' > virsorter2_\${PWD##*/}.tsv + sed -i 's/nan/0/g' virsorter2_\${PWD##*/}.tsv """ } @@ -19,3 +20,17 @@ ctg6_len=5303||full 0.967 0.733 0.967 dsDNAphage 3585 1 //awk '{ gsub(/||full/,"", $3); print }' OFS='\t' tmp_result.tsv > tmp_results2.tsv //awk '{ print \$1, \$2 }' OFS='||' virsorter2_\${PWD##*/}.tsv + +// nan if row contains nan make to 0 +// GT1_27252 0.840 virsorter2 +// GT1_27276 0.640 virsorter2 +// GT1_27283 0.560 virsorter2 +// GT1_27330 0.633 virsorter2 +// GT1_27380 0.180 virsorter2 +// GT1_1 0.840 virsorter2 +// GT1_396 0.567 virsorter2 +// GT1_2577 nan virsorter2 +// GT1_5431 nan virsorter2 +// GT1_5765 nan virsorter2 +// GT1_7346 nan virsorter2 +//sed -i 's/nan/0/g' virsorter2_test_nan.txt \ No newline at end of file