diff --git a/README.md b/README.md index c7c5eb8..260603f 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ flowchart TD ``` -Note: the process WITHOUT gubbins occurs if --skip_gubbins is true, or if there are only two samples input as gubbins requires a minimum of 3 samples. +Note: the process WITHOUT gubbins occurs if --skip_gubbins is true. This parameter should be used when your input has only 2 samples as gubbins requires a minimum of 3 samples. ## Inputs @@ -47,4 +47,64 @@ nextflow run BCCDC-PHL/snippy-core-phylogenomics \ ## Outputs +## Provenance +In the output directory, a provenance file will be written with the following format: + +``` +- pipeline_name: BCCDC-PHL/snippy-core-phylogenomics + pipeline_version: 0.1.2 + nextflow_session_id: 15245f59-7acc-4a0d-88ee-2232ddad329f + nextflow_run_name: zen_goldwasser + timestamp_analysis_start: 2024-11-21T14:59:18.014789-08:00 +- process_name: snippy_core + tools: + - tool_name: snippy_core + tool_version: 4.6.0 + parameters: + - parameter: --ref + value: H37Rv_NC000962.3.fasta + - parameter: --mask + value: mask.bed +- process_name: gubbins + tools: + - tool_name: gubbins + tool_version: 3.3.1 + parameters: + - parameter: --threads + value: 8 + - parameter: -p + value: gubbins +- process_name: snp_sites + tools: + - tool_name: snp_sites + tool_version: 2.5.1 + parameters: + - parameter: -c + value: gubbins.filtered_polymorphic_sites.fasta +- process_name: iqtree + tools: + - tool_name: iqtree + tool_version: 2.3.6 + parameters: + - parameter: -nt + value: 16 + - parameter: -fconst + value: 0,0,0,0 + - parameter: -s + value: clean.core.aln + - parameter: -st + value: DNA + - parameter: -m + value: GTR+G +- process_name: shiptv + tools: + - tool_name: shiptv + tool_version: 0.4.1 + parameters: + - parameter: -n + value: clean.core.aln.treefile + - parameter: -o + value: clean.core.aln.treefile.html + +``` \ No newline at end of file diff --git a/main.nf b/main.nf index 744c847..ccc3d5e 100644 --- a/main.nf 
+++ b/main.nf @@ -2,15 +2,27 @@ nextflow.enable.dsl = 2 -include { snippy_core } from './modules/snippy_core.nf' -include { snp_sites } from './modules/snp_sites.nf' -include { snp_dists } from './modules/snp_dists.nf' -include { gubbins } from './modules/gubbins.nf' -include { iqtree } from './modules/iqtree.nf' -include { shiptv } from './modules/shiptv.nf' +include { snippy_core } from './modules/snippy_core.nf' +include { snp_sites } from './modules/snp_sites.nf' +include { snp_dists } from './modules/snp_dists.nf' +include { gubbins } from './modules/gubbins.nf' +include { iqtree } from './modules/iqtree.nf' +include { shiptv } from './modules/shiptv.nf' +include { pipeline_provenance } from './modules/provenance.nf' +include { collect_provenance } from './modules/provenance.nf' workflow { + ch_workflow_metadata = Channel.value([ + workflow.sessionId, + workflow.runName, + workflow.manifest.name, + workflow.manifest.version, + workflow.start, + ]) + + ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata) + if (params.samplesheet_input != 'NO_FILE') { ch_snippy_dirs = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['SNIPPY_DIR']] }.map{ it -> it[1] } } else { @@ -43,10 +55,32 @@ workflow { snp_sites(ch_alignment) - iqtree(snp_sites.out) + iqtree(snp_sites.out.clean_core_aln) - snp_dists(snp_sites.out) + snp_dists(snp_sites.out.clean_core_aln) - shiptv(iqtree.out) + shiptv(iqtree.out.tree) + + // Provenance collection processes + // The basic idea is to build up a channel with the following structure: + // [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...] + // ...and then concatenate them all together in the 'collect_provenance' process. 
+ ch_pipeline_prov = pipeline_provenance.out + ch_snippy_prov = snippy_core.out.provenance + ch_gubbins_prov = params.skip_gubbins ? Channel.empty() : gubbins.out.provenance + ch_snp_sites_prov = snp_sites.out.provenance + ch_iqtree_prov = iqtree.out.provenance + ch_shiptv_prov = shiptv.out.provenance + +// Now, combine these channels in the desired order + ch_provenance = ch_pipeline_prov + .concat(ch_snippy_prov) + .concat(ch_gubbins_prov) + .concat(ch_snp_sites_prov) + .concat(ch_iqtree_prov) + .concat(ch_shiptv_prov) + .collect() + + collect_provenance(ch_provenance) } diff --git a/modules/gubbins.nf b/modules/gubbins.nf index 5f04078..9b0be89 100644 --- a/modules/gubbins.nf +++ b/modules/gubbins.nf @@ -11,10 +11,21 @@ process gubbins { path('gubbins.filtered_polymorphic_sites.fasta'), emit: filtered_polymorphic_sites path('gubbins.recombination_predictions.gff'), emit: recombination_predictions_gff path('gubbins.per_branch_statistics.tsv'), emit: per_branch_statistics + path("gubbins_provenance.yml"), emit: provenance script: """ + printf -- "- process_name: gubbins\\n" >> gubbins_provenance.yml + printf -- " tools:\\n" >> gubbins_provenance.yml + printf -- " - tool_name: gubbins\\n" >> gubbins_provenance.yml + printf -- " tool_version: \$(gubbins -h | grep -i \"Version:\" | awk '{print \$2}')\\n" >> gubbins_provenance.yml + printf -- " parameters:\\n" >> gubbins_provenance.yml + printf -- " - parameter: --threads\\n" >> gubbins_provenance.yml + printf -- " value: ${task.cpus}\\n" >> gubbins_provenance.yml + printf -- " - parameter: -p\\n" >> gubbins_provenance.yml + printf -- " value: gubbins\\n" >> gubbins_provenance.yml + run_gubbins.py \ --threads ${task.cpus} \ -p gubbins \ diff --git a/modules/iqtree.nf b/modules/iqtree.nf index a7c89f2..b3fdaf0 100644 --- a/modules/iqtree.nf +++ b/modules/iqtree.nf @@ -6,10 +6,27 @@ process iqtree { path(alignment) output: - path('*.treefile') + path('*.treefile'), emit: tree + path("iqtree_provenance.yml"), emit: provenance script: """ + printf -- "- 
process_name: iqtree\\n" >> iqtree_provenance.yml + printf -- " tools:\\n" >> iqtree_provenance.yml + printf -- " - tool_name: iqtree\\n" >> iqtree_provenance.yml + printf -- " tool_version: \$(iqtree --version 2>&1 | awk '/IQ-TREE/ {print \$4}')\\n" >> iqtree_provenance.yml + printf -- " parameters:\\n" >> iqtree_provenance.yml + printf -- " - parameter: -nt\\n" >> iqtree_provenance.yml + printf -- " value: ${task.cpus}\\n" >> iqtree_provenance.yml + printf -- " - parameter: -fconst\\n" >> iqtree_provenance.yml + printf -- " value: \$(snp-sites -C ${alignment})\\n" >> iqtree_provenance.yml + printf -- " - parameter: -s\\n" >> iqtree_provenance.yml + printf -- " value: ${alignment}\\n" >> iqtree_provenance.yml + printf -- " - parameter: -st\\n" >> iqtree_provenance.yml + printf -- " value: DNA\\n" >> iqtree_provenance.yml + printf -- " - parameter: -m\\n" >> iqtree_provenance.yml + printf -- " value: GTR+G\\n" >> iqtree_provenance.yml + iqtree \ -nt ${task.cpus} \ -fconst \$(snp-sites -C ${alignment}) \ diff --git a/modules/provenance.nf b/modules/provenance.nf new file mode 100644 index 0000000..fb75a7e --- /dev/null +++ b/modules/provenance.nf @@ -0,0 +1,40 @@ +process collect_provenance { + + + executor 'local' + + publishDir "${params.outdir}", pattern: "*_provenance.yml", mode: 'copy' + + input: + path(provenance_files) + + output: + path("*_provenance.yml") + + script: + """ + cat ${provenance_files} >\$(date +%Y%m%d%H%M%S)_provenance.yml + """ +} + +process pipeline_provenance { + + tag { pipeline_name + " / " + pipeline_version } + + executor 'local' + + input: + tuple val(session_id), val(run_name), val(pipeline_name), val(pipeline_version), val(analysis_start) + + output: + path("pipeline_provenance.yml") + + script: + """ + printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml + printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml + printf -- " nextflow_session_id: ${session_id}\\n" >> 
pipeline_provenance.yml + printf -- " nextflow_run_name: ${run_name}\\n" >> pipeline_provenance.yml + printf -- " timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml + """ +} diff --git a/modules/shiptv.nf b/modules/shiptv.nf index 4d4cf67..f6833bd 100644 --- a/modules/shiptv.nf +++ b/modules/shiptv.nf @@ -7,9 +7,20 @@ process shiptv { output: path("${tree}.html") + path("shiptv_provenance.yml"), emit: provenance script: """ + printf -- "- process_name: shiptv\\n" >> shiptv_provenance.yml + printf -- " tools:\\n" >> shiptv_provenance.yml + printf -- " - tool_name: shiptv\\n" >> shiptv_provenance.yml + printf -- " tool_version: \$(shiptv --version | awk '{print \$3}')\\n" >> shiptv_provenance.yml + printf -- " parameters:\\n" >> shiptv_provenance.yml + printf -- " - parameter: -n\\n" >> shiptv_provenance.yml + printf -- " value: ${tree}\\n" >> shiptv_provenance.yml + printf -- " - parameter: -o\\n" >> shiptv_provenance.yml + printf -- " value: ${tree}.html\\n" >> shiptv_provenance.yml + shiptv -n ${tree} -o ${tree}.html """ -} \ No newline at end of file +} diff --git a/modules/snippy.nf b/modules/snippy.nf index 02a3efe..fc36a44 100644 --- a/modules/snippy.nf +++ b/modules/snippy.nf @@ -5,7 +5,8 @@ process snippy { tuple val(grouping_key), file(fastq), path(ref) output: - path("${sample_id}", type: 'dir') + path("${sample_id}", type: 'dir'), emit: sample_id_dir + tuple val(sample_id), path("snippy_provenance.yml"), emit: provenance script: if (grouping_key =~ '_S[0-9]+_') { @@ -16,6 +17,24 @@ process snippy { read_1 = fastq[0] read_2 = fastq[1] """ + printf -- "- process_name: snippy\\n" >> snippy_provenance.yml + printf -- " tools:\\n" >> snippy_provenance.yml + printf -- " - tool_name: snippy\\n" >> snippy_provenance.yml + printf -- " tool_version: \$(snippy --version | awk '{print \$2}')\\n" >> snippy_provenance.yml + printf -- " parameters:\\n" >> snippy_provenance.yml + printf -- " - parameter: --cpus\\n" >> snippy_provenance.yml + printf 
-- " value: 8\\n" >> snippy_provenance.yml + printf -- " - parameter: --report\\n" >> snippy_provenance.yml + printf -- " value: null\\n" >> snippy_provenance.yml + printf -- " - parameter: --ref\\n" >> snippy_provenance.yml + printf -- " value: ${ref}\\n" >> snippy_provenance.yml + printf -- " - parameter: --R1\\n" >> snippy_provenance.yml + printf -- " value: ${read_1}\\n" >> snippy_provenance.yml + printf -- " - parameter: --R2\\n" >> snippy_provenance.yml + printf -- " value: ${read_2}\\n" >> snippy_provenance.yml + printf -- " - parameter: --outdir\\n" >> snippy_provenance.yml + printf -- " value: ${sample_id}\\n" >> snippy_provenance.yml + snippy \ --cpus 8 \ --report \ diff --git a/modules/snippy_core.nf b/modules/snippy_core.nf index 9408ccb..073cdce 100644 --- a/modules/snippy_core.nf +++ b/modules/snippy_core.nf @@ -12,10 +12,21 @@ process snippy_core { path('core.tsv'), emit: core_stats path('core.full.aln'), emit: full_alignment path('clean.full.aln'), emit: clean_full_alignment + path("snippy_core_provenance.yml"), emit: provenance script: """ + printf -- "- process_name: snippy_core\\n" >> snippy_core_provenance.yml + printf -- " tools:\\n" >> snippy_core_provenance.yml + printf -- " - tool_name: snippy_core\\n" >> snippy_core_provenance.yml + printf -- " tool_version: \$(snippy-core --version | awk '{print \$2}')\\n" >> snippy_core_provenance.yml + printf -- " parameters:\\n" >> snippy_core_provenance.yml + printf -- " - parameter: --ref\\n" >> snippy_core_provenance.yml + printf -- " value: ${ref}\\n" >> snippy_core_provenance.yml + printf -- " - parameter: --mask\\n" >> snippy_core_provenance.yml + printf -- " value: ${mask}\\n" >> snippy_core_provenance.yml + snippy-core \ --ref ${ref} \ --mask ${mask} \ diff --git a/modules/snp_dists.nf b/modules/snp_dists.nf index ba82ed6..4d2e2a7 100644 --- a/modules/snp_dists.nf +++ b/modules/snp_dists.nf @@ -6,10 +6,16 @@ process snp_dists { path(alignment) output: - path("${alignment.baseName}.distances.tsv") 
+ path("${alignment.baseName}.distances.tsv"), emit: distances + path("snp_dists_provenance.yml"), emit: provenance script: """ + printf -- "- process_name: snp_dists\\n" >> snp_dists_provenance.yml + printf -- " tools:\\n" >> snp_dists_provenance.yml + printf -- " - tool_name: snp-dists\\n" >> snp_dists_provenance.yml + printf -- " tool_version: \$(snp-dists -v | awk '{print \$2}')\\n" >> snp_dists_provenance.yml + snp-dists \ '${alignment}' \ > ${alignment.baseName}.distances.tsv diff --git a/modules/snp_sites.nf b/modules/snp_sites.nf index 28f85ed..8053963 100644 --- a/modules/snp_sites.nf +++ b/modules/snp_sites.nf @@ -6,10 +6,19 @@ process snp_sites { path(alignment) output: - path('clean.core.aln') + path('clean.core.aln'), emit: clean_core_aln + path("snp_sites_provenance.yml"), emit: provenance script: """ + printf -- "- process_name: snp_sites\\n" >> snp_sites_provenance.yml + printf -- " tools:\\n" >> snp_sites_provenance.yml + printf -- " - tool_name: snp_sites\\n" >> snp_sites_provenance.yml + printf -- " tool_version: \$(snp-sites -V | awk '{print \$2}')\\n" >> snp_sites_provenance.yml + printf -- " parameters:\\n" >> snp_sites_provenance.yml + printf -- " - parameter: -c\\n" >> snp_sites_provenance.yml + printf -- " value: ${alignment}\\n" >> snp_sites_provenance.yml + snp-sites \ -c '${alignment}' \ > clean.core.aln diff --git a/nextflow.config b/nextflow.config index b7cb5b1..a0d33e0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,6 +2,7 @@ manifest { name = "BCCDC-PHL/snippy-core-phylogenomics" mainScript = 'main.nf' nextflowVersion = '>=20.01.0' + version = '0.1.2' } params {