Skip to content

Commit

Permalink
Merge pull request #19 from BCCDC-PHL/add_prov
Browse files Browse the repository at this point in the history
add provenance
  • Loading branch information
dfornika authored Nov 22, 2024
2 parents 409718c + 8cea019 commit f603314
Show file tree
Hide file tree
Showing 11 changed files with 234 additions and 15 deletions.
62 changes: 61 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ flowchart TD
```

Note: the process WITHOUT gubbins occurs if --skip_gubbins is true, or if there are only two samples input as gubbins requires a minimum of 3 samples.
Note: the process WITHOUT gubbins occurs if --skip_gubbins is true. This parameter should be used when your input has only 2 samples as gubbins requires a minimum of 3 samples.

## Inputs

Expand All @@ -47,4 +47,64 @@ nextflow run BCCDC-PHL/snippy-core-phylogenomics \

## Outputs

## Provenance

In the output directory, a provenance file will be written with the following format:

```
- pipeline_name: BCCDC-PHL/snippy-core-phylogenomics
pipeline_version: 0.1.2
nextflow_session_id: 15245f59-7acc-4a0d-88ee-2232ddad329f
nextflow_run_name: zen_goldwasser
timestamp_analysis_start: 2024-11-21T14:59:18.014789-08:00
- process_name: snippy_core
tools:
- tool_name: snippy_core
tool_version: 4.6.0
parameters:
- parameter: --ref
value: H37Rv_NC000962.3.fasta
- parameter: --mask
value: mask.bed
- process_name: gubbins
tools:
- tool_name: gubbins
tool_version: 3.3.1
parameters:
- parameter: --threads
value: 8
- parameter: -p
value: gubbins
- process_name: snp_sites
tools:
- tool_name: snp_sites
tool_version: 2.5.1
parameters:
- parameter: -c
value: gubbins.filtered_polymorphic_sites.fasta
- process_name: iqtree
tools:
- tool_name: iqtree
tool_version: 2.3.6
parameters:
- parameter: -nt
value: 16
- parameter: -fconst
value: 0,0,0,0
- parameter: -s
value: clean.core.aln
- parameter: -st
value: DNA
- parameter: -m
value: GTR+G
- process_name: shiptv
tools:
- tool_name: shiptv
tool_version: 0.4.1
parameters:
- parameter: -n
value: clean.core.aln.treefile
- parameter: -o
value: clean.core.aln.treefile.html
```
52 changes: 43 additions & 9 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,27 @@

nextflow.enable.dsl = 2

include { snippy_core } from './modules/snippy_core.nf'
include { snp_sites } from './modules/snp_sites.nf'
include { snp_dists } from './modules/snp_dists.nf'
include { gubbins } from './modules/gubbins.nf'
include { iqtree } from './modules/iqtree.nf'
include { shiptv } from './modules/shiptv.nf'
include { snippy_core } from './modules/snippy_core.nf'
include { snp_sites } from './modules/snp_sites.nf'
include { snp_dists } from './modules/snp_dists.nf'
include { gubbins } from './modules/gubbins.nf'
include { iqtree } from './modules/iqtree.nf'
include { shiptv } from './modules/shiptv.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'


workflow {
ch_workflow_metadata = Channel.value([
workflow.sessionId,
workflow.runName,
workflow.manifest.name,
workflow.manifest.version,
workflow.start,
])

ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)

if (params.samplesheet_input != 'NO_FILE') {
ch_snippy_dirs = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['SNIPPY_DIR']] }.map{ it -> it[1] }
} else {
Expand Down Expand Up @@ -43,10 +55,32 @@ workflow {

snp_sites(ch_alignment)

iqtree(snp_sites.out)
iqtree(snp_sites.out.clean_core_aln)

snp_dists(snp_sites.out)
snp_dists(snp_sites.out.clean_core_aln)

shiptv(iqtree.out)
shiptv(iqtree.out.tree)

// Provenance collection processes
// The basic idea is to build up a channel with the following structure:
// [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml, ...]
// ...and then concatenate them all together in the 'collect_provenance' process.
ch_pipeline_prov = pipeline_provenance.out
ch_snippy_prov = snippy_core.out.provenance
ch_gubbins_prov = gubbins.out.provenance
ch_snp_sites_prov = snp_sites.out.provenance
ch_iqtree_prov = iqtree.out.provenance
ch_shiptv_prov = shiptv.out.provenance

// Now, combine these channels in the desired order
ch_provenance = ch_pipeline_prov
.concat(ch_snippy_prov)
.concat(ch_gubbins_prov)
.concat(ch_snp_sites_prov)
.concat(ch_iqtree_prov)
.concat(ch_shiptv_prov)
.collect()

collect_provenance(ch_provenance)

}
11 changes: 11 additions & 0 deletions modules/gubbins.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,21 @@ process gubbins {
path('gubbins.filtered_polymorphic_sites.fasta'), emit: filtered_polymorphic_sites
path('gubbins.recombination_predictions.gff'), emit: recombination_predictions_gff
path('gubbins.per_branch_statistics.tsv'), emit: per_branch_statistics
path("gubbins_provenance.yml"), emit: provenance


script:
"""
printf -- "- process_name: gubbins\\n" >> gubbins_provenance.yml
printf -- " tools:\\n" >> gubbins_provenance.yml
printf -- " - tool_name: gubbins\\n" >> gubbins_provenance.yml
printf -- " tool_version: \$(gubbins -h | grep -i \"Version:\" | awk '{print \$2}')\\n" >> gubbins_provenance.yml
printf -- " parameters:\\n" >> gubbins_provenance.yml
printf -- " - parameter: --threads\\n" >> gubbins_provenance.yml
printf -- " value: ${task.cpus}\\n" >> gubbins_provenance.yml
printf -- " - parameter: -p\\n" >> gubbins_provenance.yml
printf -- " value: gubbins\\n" >> gubbins_provenance.yml
run_gubbins.py \
--threads ${task.cpus} \
-p gubbins \
Expand Down
19 changes: 18 additions & 1 deletion modules/iqtree.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,27 @@ process iqtree {
path(alignment)

output:
path('*.treefile')
path('*.treefile'), emit: tree
path("iqtree_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: iqtree\\n" >> iqtree_provenance.yml
printf -- " tools:\\n" >> iqtree_provenance.yml
printf -- " - tool_name: iqtree\\n" >> iqtree_provenance.yml
printf -- " tool_version: \$(iqtree --version 2>&1 | awk '/IQ-TREE/ {print \$4}')\\n" >> iqtree_provenance.yml
printf -- " parameters:\\n" >> iqtree_provenance.yml
printf -- " - parameter: -nt\\n" >> iqtree_provenance.yml
printf -- " value: ${task.cpus}\\n" >> iqtree_provenance.yml
printf -- " - parameter: -fconst\\n" >> iqtree_provenance.yml
printf -- " value: \$(snp-sites -C ${alignment})\\n" >> iqtree_provenance.yml
printf -- " - parameter: -s\\n" >> iqtree_provenance.yml
printf -- " value: ${alignment}\\n" >> iqtree_provenance.yml
printf -- " - parameter: -st\\n" >> iqtree_provenance.yml
printf -- " value: DNA\\n" >> iqtree_provenance.yml
printf -- " - parameter: -m\\n" >> iqtree_provenance.yml
printf -- " value: GTR+G\\n" >> iqtree_provenance.yml
iqtree \
-nt ${task.cpus} \
-fconst \$(snp-sites -C ${alignment}) \
Expand Down
40 changes: 40 additions & 0 deletions modules/provenance.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
 * collect_provenance
 *
 * Concatenates the provenance fragments it receives into a single file named
 * <YYYYmmddHHMMSS>_provenance.yml and publishes a copy to params.outdir.
 *
 * Input:
 *   provenance_files - one or more provenance fragment files; they are
 *                      concatenated in the order in which they are received.
 * Output:
 *   the timestamped *_provenance.yml file created by the script.
 */
process collect_provenance {

    executor 'local'

    publishDir "${params.outdir}", pattern: "*_provenance.yml", mode: 'copy'

    input:
    path(provenance_files)

    output:
    // 'path' is used instead of the deprecated 'file' qualifier, matching the
    // other modules of this pipeline.
    // NOTE(review): the glob also matches the staged input fragments; this
    // relies on Nextflow not capturing input files in output globs by
    // default -- confirm against the Nextflow version in use.
    path("*_provenance.yml")

    script:
    // The timestamp is evaluated by the shell when the task runs, hence the
    // escaped '\$(...)' so that Groovy does not try to interpolate it.
    """
    cat ${provenance_files} > \$(date +%Y%m%d%H%M%S)_provenance.yml
    """
}

/*
 * pipeline_provenance
 *
 * Writes the pipeline-level provenance fragment (pipeline_provenance.yml):
 * a single YAML list item holding the pipeline name and version, the
 * Nextflow session id and run name, and the analysis start timestamp.
 *
 * Input:
 *   tuple of (session_id, run_name, pipeline_name, pipeline_version,
 *   analysis_start), all passed by value.
 * Output:
 *   pipeline_provenance.yml
 */
process pipeline_provenance {

    tag { pipeline_name + " / " + pipeline_version }

    executor 'local'

    input:
    tuple val(session_id), val(run_name), val(pipeline_name), val(pipeline_version), val(analysis_start)

    output:
    // 'path' is used instead of the deprecated 'file' qualifier, matching the
    // other modules of this pipeline.
    path("pipeline_provenance.yml")

    script:
    // Keys after the first are indented by two spaces so that they line up
    // under 'pipeline_name' and belong to the same YAML list item.
    """
    printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml
    printf -- "  pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml
    printf -- "  nextflow_session_id: ${session_id}\\n" >> pipeline_provenance.yml
    printf -- "  nextflow_run_name: ${run_name}\\n" >> pipeline_provenance.yml
    printf -- "  timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml
    """
}
13 changes: 12 additions & 1 deletion modules/shiptv.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,20 @@ process shiptv {

output:
path("${tree}.html")
path("shiptv_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: shiptv\\n" >> shiptv_provenance.yml
printf -- " tools:\\n" >> shiptv_provenance.yml
printf -- " - tool_name: shiptv\\n" >> shiptv_provenance.yml
printf -- " tool_version: \$(shiptv --version | awk '{print \$3}')\\n" >> shiptv_provenance.yml
printf -- " parameters:\\n" >> shiptv_provenance.yml
printf -- " - parameter: -n\\n" >> shiptv_provenance.yml
printf -- " value: ${tree}\\n" >> shiptv_provenance.yml
printf -- " - parameter: -o\\n" >> shiptv_provenance.yml
printf -- " value: ${tree}.html\\n" >> shiptv_provenance.yml
shiptv -n ${tree} -o ${tree}.html
"""
}
}
21 changes: 20 additions & 1 deletion modules/snippy.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ process snippy {
tuple val(grouping_key), file(fastq), path(ref)

output:
path("${sample_id}", type: 'dir')
path("${sample_id}", type: 'dir'), emit: sample_id_dir
tuple val(sample_id), path("snippy_provenance.yml"), emit: provenance

script:
if (grouping_key =~ '_S[0-9]+_') {
Expand All @@ -16,6 +17,24 @@ process snippy {
read_1 = fastq[0]
read_2 = fastq[1]
"""
# Write the provenance fragment for this task: tool name, tool version and
# the options handed to snippy. Shell dollar signs are backslash-escaped so
# that they reach bash instead of being interpolated by Groovy.
printf -- "- process_name: snippy\\n" >> snippy_provenance.yml
printf -- "  tools:\\n" >> snippy_provenance.yml
printf -- "    - tool_name: snippy\\n" >> snippy_provenance.yml
printf -- "      tool_version: \$(snippy --version | awk '{print \$2}')\\n" >> snippy_provenance.yml
printf -- "      parameters:\\n" >> snippy_provenance.yml
printf -- "        - parameter: --cpus\\n" >> snippy_provenance.yml
printf -- "          value: 8\\n" >> snippy_provenance.yml
printf -- "        - parameter: --report\\n" >> snippy_provenance.yml
printf -- "          value: null\\n" >> snippy_provenance.yml
printf -- "        - parameter: --ref\\n" >> snippy_provenance.yml
printf -- "          value: ${ref}\\n" >> snippy_provenance.yml
printf -- "        - parameter: --R1\\n" >> snippy_provenance.yml
printf -- "          value: ${read_1}\\n" >> snippy_provenance.yml
printf -- "        - parameter: --R2\\n" >> snippy_provenance.yml
printf -- "          value: ${read_2}\\n" >> snippy_provenance.yml
printf -- "        - parameter: --outdir\\n" >> snippy_provenance.yml
printf -- "          value: ${sample_id}\\n" >> snippy_provenance.yml
snippy \
--cpus 8 \
--report \
Expand Down
11 changes: 11 additions & 0 deletions modules/snippy_core.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,21 @@ process snippy_core {
path('core.tsv'), emit: core_stats
path('core.full.aln'), emit: full_alignment
path('clean.full.aln'), emit: clean_full_alignment
path("snippy_core_provenance.yml"), emit: provenance


script:
"""
printf -- "- process_name: snippy_core\\n" >> snippy_core_provenance.yml
printf -- " tools:\\n" >> snippy_core_provenance.yml
printf -- " - tool_name: snippy_core\\n" >> snippy_core_provenance.yml
printf -- " tool_version: \$(snippy-core --version | awk '{print \$2}')\\n" >> snippy_core_provenance.yml
printf -- " parameters:\\n" >> snippy_core_provenance.yml
printf -- " - parameter: --ref\\n" >> snippy_core_provenance.yml
printf -- " value: ${ref}\\n" >> snippy_core_provenance.yml
printf -- " - parameter: --mask\\n" >> snippy_core_provenance.yml
printf -- " value: ${mask}\\n" >> snippy_core_provenance.yml
snippy-core \
--ref ${ref} \
--mask ${mask} \
Expand Down
8 changes: 7 additions & 1 deletion modules/snp_dists.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@ process snp_dists {
path(alignment)

output:
path("${alignment.baseName}.distances.tsv")
path("${alignment.baseName}.distances.tsv"), emit: distances
path("snp_dists_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: snp_dists\\n" >> snp_dists_provenance.yml
printf -- " tools:\\n" >> snp_dists_provenance.yml
printf -- " - tool_name: snp-dists\\n" >> snp_dists_provenance.yml
printf -- " tool_version: \$(snp-dists -v | awk '{print \$2}')\\n" >> snp_dists_provenance.yml
snp-dists \
'${alignment}' \
> ${alignment.baseName}.distances.tsv
Expand Down
11 changes: 10 additions & 1 deletion modules/snp_sites.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,19 @@ process snp_sites {
path(alignment)

output:
path('clean.core.aln')
path('clean.core.aln'), emit: clean_core_aln
path("snp_sites_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: snp_sites\\n" >> snp_sites_provenance.yml
printf -- " tools:\\n" >> snp_sites_provenance.yml
printf -- " - tool_name: snp_sites\\n" >> snp_sites_provenance.yml
printf -- " tool_version: \$(snp-sites -V | awk '{print \$2}')\\n" >> snp_sites_provenance.yml
printf -- " parameters:\\n" >> snp_sites_provenance.yml
printf -- " - parameter: -c\\n" >> snp_sites_provenance.yml
printf -- " value: ${alignment}\\n" >> snp_sites_provenance.yml
snp-sites \
-c '${alignment}' \
> clean.core.aln
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ manifest {
name = "BCCDC-PHL/snippy-core-phylogenomics"
mainScript = 'main.nf'
nextflowVersion = '>=20.01.0'
version= '0.1.2'
}

params {
Expand Down

0 comments on commit f603314

Please sign in to comment.