diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6f342a0f..60c30f2f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,10 +55,10 @@ jobs: # Remember that you can parallelise this by using strategy.matrix # We also test basic visualization and reporting options here run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --viz - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --smoothxg consensus_spec 10,100,1000 - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --vcf_spec "gi|568815561:#,gi|568815567:#" - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --smoothxg_write_maf - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --wfmash_chunks 2 - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --wfmash_only + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --no_viz --no_layout + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --smoothxg consensus_spec 10,100,1000 + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --vcf_spec "gi|568815561:#,gi|568815567:#" + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --smoothxg_write_maf + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --wfmash_chunks 2 + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --n_mappings 11 --wfmash_only diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index fcde400c..ad77dd56 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-node@v1 with: - node-version: '10' + node-version: '14' - name: Install markdownlint run: npm install -g markdownlint-cli - name: Run Markdownlint @@ -53,7 +53,7 @@ jobs: - uses: actions/checkout@v1 - uses: actions/setup-node@v1 with: - node-version: '10' + node-version: '14' - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint @@ -101,7 +101,7 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.6' + python-version: '3.9' architecture: 'x64' - name: Install dependencies @@ -114,7 +114,7 @@ jobs: GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} - run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} --markdown lint_results.md + run: nf-core -l lint_log.txt lint -d ${GITHUB_WORKSPACE} --markdown lint_results.md - name: Save PR number if: ${{ always() }} diff --git a/Dockerfile b/Dockerfile index 0845e437..9b6af64a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ghcr.io/pangenome/pggb:20211103204137531f85 +FROM ghcr.io/pangenome/pggb:2022092217355879ede8 LABEL authors="Simon Heumos, Michael Heuer, Lukas Heumos, Erik Garrison, Andrea Guarracino" \ description="Docker image containing all software requirements for the nf-core/pangenome pipeline" diff --git a/README.md b/README.md index 52cc788f..957fd16d 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ ## Introduction -**Warning:** This pipeline is currently UNDER CONSTRUCTION. Some features may not work or not work as intended! +> **Warning:** This pipeline is currently UNDER CONSTRUCTION. Some features may not work or not work as intended! **nf-core/pangenome** is a bioinformatics best-practise analysis pipeline for the rendering of a collection of sequences into a pangenome graph. -Its goal is to build a graph that is locally directed and acyclic while preserving large-scale variation. Maintaining local linearity is important for interpretation, visualization, mapping, comparative genomics, and reuse of pangenome graphs**. +Its goal is to build a graph that is locally directed and acyclic while preserving large-scale variation. Maintaining local linearity is important for interpretation, visualization, mapping, comparative genomics, and reuse of pangenome graphs. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. @@ -35,7 +35,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 4. Test the workflow on a minimal dataset ```bash - nextflow run nf-core/pangenome -profile test, + nextflow run nf-core/pangenome -profile test, --n_mappings 11 ``` [//]: # (```bash nextflow run nf-core/pangenome -profile test,```) @@ -45,10 +45,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 5. Start running your own analysis! ```bash - nextflow run nf-core/pangenome -profile --input "input.fa.gz" + nextflow run nf-core/pangenome -profile --input "input.fa.gz" --n_mappings 11 ``` -See [usage docs](https://nf-co.re/pangenome/usage) for all of the available options when running the pipeline. +Be careful, the input FASTA must have been compressed with [bgzip](http://www.htslib.org/doc/bgzip.html). See [usage docs](https://nf-co.re/pangenome/usage) for all of the available options when running the pipeline. ## Pipeline Summary @@ -68,12 +68,12 @@ Many thanks to all who have helped out and contributed along the way, including | Name | Affiliation | |----------------------------------------------------------|---------------------------------------------------------------------------------------| -| [Philipp Ehmele](https://github.com/imipenem) | [University of Hamburg, Hamburg, Germany](https://www.uni-hamburg.de/en.html) | +| [Philipp Ehmele](https://github.com/imipenem) | [Institute of Computational Biology, Helmholtz Zentrum München, Munich, Germany](https://www.helmholtz-muenchen.de/icb/index.html) | | [Erik Garrison](https://github.com/ekg) | [The University of Tennessee Health Science Center, Memphis, Tennessee, TN, USA](https://uthsc.edu/)| -| [Andrea Guarracino](https://github.com/AndreaGuarracino) | [University of Rome Tor Vergata, Rome, Italy](http://www.scienze.uniroma2.it/) | +| [Andrea Guarracino](https://github.com/AndreaGuarracino) | [Genomics Research Centre, Human Technopole, Milan, Italy](https://humantechnopole.it/en/) | | [Michael Heuer](https://github.com/heuermh) | [UC Berkeley, USA](https://rise.cs.berkeley.edu) | -| [Lukas Heumos](https://github.com/zethson) | [Institute of Computational Biology, Helmholtz Zentrum München, Munich, Germany](https://www.helmholtz-muenchen.de/icb/index.html) \\ [Institute of Lung Biology and Disease and Comprehensive Pneumology Center, Helmholtz Zentrum München, Munich, Germany](https://www.helmholtz-muenchen.de/ilbd/the-institute/cpc/index.html) | -| [Simon Heumos](https://github.com/subwaystation) | [Quantitative Biology Center (QBiC) Tübingen, University of Tübingen, Germany](https://uni-tuebingen.de/en/research/research-infrastructure/quantitative-biology-center-qbic/) | +| [Lukas Heumos](https://github.com/zethson) | [Institute of Computational Biology, Helmholtz Zentrum München, Munich, Germany](https://www.helmholtz-muenchen.de/icb/index.html)
[Institute of Lung Biology and Disease and Comprehensive Pneumology Center, Helmholtz Zentrum München, Munich, Germany](https://www.helmholtz-muenchen.de/ilbd/the-institute/cpc/index.html) | +| [Simon Heumos](https://github.com/subwaystation) | [Quantitative Biology Center (QBiC) Tübingen, University of Tübingen, Germany](https://uni-tuebingen.de/en/research/research-infrastructure/quantitative-biology-center-qbic/)
[Biomedical Data Science, Department of Computer Science, University of Tübingen, Germany](https://uni-tuebingen.de/en/faculties/faculty-of-science/departments/computer-science/department/) | > \* Listed in alphabetical order @@ -100,8 +100,22 @@ In addition, references of tools and data used in this pipeline are as follows: > **ODGI: understanding pangenome graphs.** > -> Andrea Guarracino, Simon Heumos, Sven Nahnsen, Pjotr Prins & Erik Garrison. +> Andrea Guarracino*, Simon Heumos*, Sven Nahnsen, Pjotr Prins & Erik Garrison. > -> _bioRxiv_ 2021 Nov 11 doi: [10.1101/2021.11.10.467921](https://doi.org/10.1101/2021.11.10.467921). +> _Bioinformatics_ 2022 Jul 01 doi: [10.1093/bioinformatics/btac308](https://doi.org/10.1093/bioinformatics/btac308). +> +> *_contributed equally_ + +> **Unbiased pangenome graphs** +> +> Erik Garrison, Andrea Guarracino. +> +> _bioRxiv_ 2022 Feb 02 doi: [10.1101/2022.02.14.480413](https://doi.org/10.1101/2022.02.14.480413). + +## Attention + +### MultiQC Report + +In the resulting MultiQC report, in the **Detailed ODGI stats table**, it says `smoothxg`. To be clear, these are the stats of the graph after polishing with `gfaffix`! Some tools were hardcoded in the ODGI MultiQC module, but hopefully this will be fixed in the future. diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 779d97b4..c2d56baa 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,5 +1,8 @@ # Report section config for nice titles and descriptions custom_data: + odgi_O: + section_name: ODGI Compressed 1D visualization + description: This image shows a 1D rendering of the built pangenome graph. The graph nodes are arranged from left to right, forming the pangenome sequence. Summarization of path coverage across all paths. A heatmap color-coding from https://colorbrewer2.org/#type=diverging&scheme=RdBu&n=11 is used. Dark blue means highest coverage. Dark red means lowest coverage. The path names are placed on the left. The black lines under the paths are the links, which represent the graph topology. odgi_viz: section_name: ODGI 1D visualization description: This image shows a 1D rendering of the built pangenome graph. The graph nodes are arranged from left to right, forming the pangenome sequence. The colored bars represent the paths versus the pangenome sequence in a binary matrix. The path names are placed on the left. The black lines under the paths are the links, which represent the graph topology. @@ -18,6 +21,8 @@ custom_data: # Custom search patterns to find the image outputs sp: + odgi_O: + fn: "*O_multiqc.png" odgi_draw: fn: "*draw_multiqc.png" odgi_viz: @@ -40,6 +45,7 @@ module_order: # Set the order that the custom content plots should come in custom_content: order: + - odgi_O - odgi_viz - odgi_viz_pos - odgi_viz_inv diff --git a/main.nf b/main.nf index c4fc67ba..6b57f499 100644 --- a/main.nf +++ b/main.nf @@ -17,24 +17,102 @@ if (params.help){ exit 0 } +if (params.input == null) { + log.info""" + + Mandatory argument --input missing! For more details run with --help. + + """.stripIndent() + + exit 1 +} + +if (params.n_mappings == null) { + log.info""" + + Mandatory argument --n_mappings missing! For more details run with --help. + + """.stripIndent() + + exit 1 +} + ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) // ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() // We can't change global parameters inside this scope, so we build the ones we need locally +def n_haps = 0 +if (!params.smoothxg_num_haps) { + n_haps = params.n_mappings +} + def wfmash_merge_cmd = params.wfmash_merge_segments ? "-M" : "" def wfmash_exclude_cmd = params.wfmash_exclude_delim ? "-Y${params.wfmash_exclude_delim}" : "-X" def wfmash_split_cmd = params.wfmash_no_splits ? "-N" : "" -def smoothxg_poa_params_display = params.smoothxg_poa_params.replaceAll(/,/, "_") +def wfmash_block_length_cmd = params.wfmash_block_length ? "-l${params.wfmash_block_length}" : "" +def wfmash_mash_kmer_cmd = params.wfmash_mash_kmer ? "-k${params.wfmash_mash_kmer}" : "" +def wfmash_kmer_thres_cmd = params.wfmash_mash_kmer_thres ? "-H${params.wfmash_kmer_thres}" : "" +def wfmash_n_mappings_minus_1 = params.n_mappings - 1 +def wfmash_sparse_map_cmd = "" +if (params.wfmash_sparse_map == "auto") { + n = n_haps + x = Math.log(n)/n * 10 + wfmash_sparse_map_frac = 1 + if (x >= 1) { + wfmash_sparse_map_frac = x + } + wfmash_sparse_map_cmd = "-x${wfmash_sparse_map_frac}" +} else { + if (params.wfmash_sparse_map != null) { + wfmash_sparse_map_cmd = "-x${params.wfmash_sparse_map}" + } +} +def wfmash_temp_dir = params.wfmash_temp_dir ? "-B${params.wfmash_temp_dir}" : "" + +def seqwish_temp_dir = params.seqwish_temp_dir ? "--temp-dir${params.seqwish_temp_dir}" : "" + +def smoothxg_block_id_min = params.wfmash_map_pct_id / 100.0 +// TODO: CHANGE TO LARGE P ONCE WE ARE THERE +def smoothxg_poa_params_cmd = "" +if (params.smoothxg_poa_params == null) { + smoothxg_poa_params = "-P 1,19,39,3,81,1" +} else { + if (params.smoothxg_poa_params == "asm5") { + smoothxg_poa_params = "-P 1,19,39,3,81,1" + } else if (params.smoothxg_poa_params == "asm10") { + smoothxg_poa_params = "-P 1,9,16,2,41,1" + } else if (params.smoothxg_poa_params == "asm15") { + smoothxg_poa_params = "-P 1,7,11,2,33,1" + } else if (params.smoothxg_poa_params == "asm20") { + smoothxg_poa_params = "-P 1,4,6,2,26,1"B + } else { + smoothxg_poa_params = "-P${params.smoothxg_poa_params}" + } +} +def smoothxg_poa_params_display = smoothxg_poa_params.replaceAll(/,/, "_") +def smoothxg_temp_dir = params.smoothxg_temp_dir ? "-b${params.smoothxg_temp_dir}" : "" +def smoothxg_keep_intermediate_files = params.smoothxg_keep_intermediate_files ? "-K" : "" +def smoothxg_xpoa = "-S" +if (params.smoothxg_run_abpoa != null) { + smoothxg_xpoa = "" +} +def smoothxg_poa_mode = params.smoothxg_run_global_poa ? "-Z" : "" +// disabling consensus graph mode +def smoothxg_consensus_spec = false + def wfmash_prefix = "wfmash" def seqwish_prefix = ".seqwish" def smoothxg_prefix = ".smoothxg" -def n_haps = 0 -def do_1d = false -def do_2d = false -if (params.viz) { - do_1d = true - do_2d = true +def do_1d = true +def do_2d = true + +if (params.no_viz) { + do_1d = false +} + +if (params.no_layout) { + do_2d = false } def make_file_prefix = { f -> """\ @@ -42,16 +120,32 @@ ${f.getName()}\ """ } fasta = channel.fromPath("${params.input}").map { f -> tuple(make_file_prefix(f), f) } +fai_path = file("${params.input}.fai") +gzi_path = file("${params.input}.gzi") -if (!params.smoothxg_num_haps) { - n_haps = params.wfmash_n_mappings +process samtoolsFaidx { + publishDir "${params.outdir}/samtools_faidx", mode: "${params.publish_dir_mode}" + + input: + tuple val(f), path(fasta) + + output: + path("${f}.fai"), emit: samtools_fai + path("${f}.gzi"), emit: samtools_gzi + + """ + samtools faidx $fasta + """ } + process wfmashMap { publishDir "${params.outdir}/wfmash_map", mode: "${params.publish_dir_mode}" input: tuple val(f), path(fasta) + path(fai) + path(gzi) output: tuple val(f), path("${f}.${wfmash_prefix}.map.paf") @@ -59,12 +153,15 @@ process wfmashMap { """ wfmash ${wfmash_exclude_cmd} \ -s ${params.wfmash_segment_length} \ - -l ${params.wfmash_block_length} \ + ${wfmash_block_length_cmd} \ ${wfmash_merge_cmd} \ ${wfmash_split_cmd} \ + ${wfmash_mash_kmer_cmd} \ + ${wfmash_kmer_thres_cmd} \ + ${wfmash_sparse_map_cmd} \ -p ${params.wfmash_map_pct_id} \ - -n ${params.wfmash_n_mappings} \ - -k ${params.wfmash_mash_kmer} \ + -n ${wfmash_n_mappings_minus_1} \ + ${wfmash_temp_dir} \ -t ${task.cpus} \ -m \ $fasta $fasta \ @@ -88,20 +185,25 @@ process wfmashAlign { publishDir "${params.outdir}/wfmash_align", mode: "${params.publish_dir_mode}" input: - tuple val(f), path(fasta), path(paf) + tuple val(f), path(fasta), path(paf) + path(fai) + path(gzi) output: - path("${paf}.align.paf") + path("${paf}.align.paf"), emit: paf """ wfmash ${wfmash_exclude_cmd} \ -s ${params.wfmash_segment_length} \ - -l ${params.wfmash_block_length} \ + ${wfmash_block_length_cmd} \ ${wfmash_merge_cmd} \ ${wfmash_split_cmd} \ + ${wfmash_mash_kmer_cmd} \ + ${wfmash_kmer_thres_cmd} \ + ${wfmash_sparse_map_cmd} \ -p ${params.wfmash_map_pct_id} \ - -n ${params.wfmash_n_mappings} \ - -k ${params.wfmash_mash_kmer} \ + -n ${wfmash_n_mappings_minus_1} \ + ${wfmash_temp_dir} \ -t ${task.cpus} \ -i $paf \ $fasta $fasta \ @@ -114,22 +216,27 @@ process wfmash { input: tuple val(f), path(fasta) + path(fai) + path(gzi) output: - tuple val(f), path("${f}${wfmash_prefix}.paf") + tuple val(f), path("${f}.${wfmash_prefix}.paf") """ wfmash ${wfmash_exclude_cmd} \ -s ${params.wfmash_segment_length} \ - -l ${params.wfmash_block_length} \ + ${wfmash_block_length_cmd} \ ${wfmash_merge_cmd} \ ${wfmash_split_cmd} \ + ${wfmash_mash_kmer_cmd} \ + ${wfmash_kmer_thres_cmd} \ + ${wfmash_sparse_map_cmd} \ -p ${params.wfmash_map_pct_id} \ - -n ${params.wfmash_n_mappings} \ - -k ${params.wfmash_mash_kmer} \ + -n ${wfmash_n_mappings_minus_1} \ + ${wfmash_temp_dir} \ -t ${task.cpus} \ $fasta $fasta \ - >${f}${wfmash_prefix}.paf + >${f}.${wfmash_prefix}.paf """ } @@ -138,26 +245,23 @@ process seqwish { input: tuple val(f), path(fasta) - path(pafs) + path(paf) output: tuple val(f), path("${f}${seqwish_prefix}.gfa") script: + def input = paf.join(',') """ - if [[ \$(ls *.paf | wc -l) == 1 ]]; then - input=$pafs - else - input=\$(ls *.paf | tr '\\\n' ',') - input=\${input::-1} - fi seqwish \ -t ${task.cpus} \ -s $fasta \ - -p \$input \ + -p $input \ -k ${params.seqwish_min_match_length} \ + -f ${params.seqwish_sparse_factor} \ -g ${f}${seqwish_prefix}.gfa -P \ -B ${params.seqwish_transclose_batch} \ + ${seqwish_temp_dir} \ -P """ } @@ -194,44 +298,45 @@ process smoothxg { -T ${task.cpus} \ -g \$input_gfa \ -w \$(echo "\$poa_length * ${n_haps}" | bc) \ - -K \ + ${smoothxg_temp_dir} \ + ${smoothxg_keep_intermediate_files} \ -X 100 \ - -I ${params.smoothxg_block_id_min} \ + -I ${smoothxg_block_id_min} \ -R ${params.smoothxg_block_ratio_min} \ -j ${params.smoothxg_max_path_jump} \ -e ${params.smoothxg_max_edge_jump} \ -l \$poa_length \ - -p ${params.smoothxg_poa_params} \ + ${smoothxg_poa_params} \ -O ${params.smoothxg_poa_padding} \ -Y \$(echo "${params.smoothxg_pad_max_depth} * ${n_haps}" | bc) \ -d 0 -D 0 \ + ${smoothxg_xpoa} \ + ${smoothxg_poa_mode} \ -V \ -o smooth.\$i.gfa else poa_length=\$(echo ${params.smoothxg_poa_length} | cut -f \$i -d,) - consensus_params="-V" - if [[ ${params.smoothxg_consensus_spec} != false ]]; then - consensus_params="-C ${f}.cons,${params.smoothxg_consensus_spec}" - fi smoothxg \ -t ${task.cpus} \ -T ${task.cpus} \ -g \$input_gfa \ -w \$(echo "\$poa_length * ${n_haps}" | bc) \ - -K \ + ${smoothxg_temp_dir} \ + ${smoothxg_keep_intermediate_files} \ -X 100 \ - -I ${params.smoothxg_block_id_min} \ + -I ${smoothxg_block_id_min} \ -R ${params.smoothxg_block_ratio_min} \ -j ${params.smoothxg_max_path_jump} \ -e ${params.smoothxg_max_edge_jump} \ -l \$poa_length \ - -p ${params.smoothxg_poa_params} \ + ${smoothxg_poa_params} \ -O ${params.smoothxg_poa_padding} \ -Y \$(echo "${params.smoothxg_pad_max_depth} * ${n_haps}" | bc) \ -d 0 -D 0 \ + ${smoothxg_xpoa} \ + ${smoothxg_poa_mode} \ \$maf_params \ - -Q ${params.smoothxg_consensus_prefix} \ - \$consensus_params \ + -V \ -o ${f}${smoothxg_prefix}.gfa fi done @@ -300,6 +405,7 @@ process odgiViz { odgi viz -i $graph -o ${graph}.viz_pos_multiqc.png -x 1500 -y 500 -a 10 -I ${params.smoothxg_consensus_prefix} -u -d odgi viz -i $graph -o ${graph}.viz_depth_multiqc.png -x 1500 -y 500 -a 10 -I ${params.smoothxg_consensus_prefix} -m odgi viz -i $graph -o ${graph}.viz_inv_multiqc.png -x 1500 -y 500 -a 10 -I ${params.smoothxg_consensus_prefix} -z + odgi viz -i $graph -o ${graph}.viz_O_multiqc.png -x 1500 -y 500 -a 10 -I ${params.smoothxg_consensus_prefix} -O """ } @@ -343,24 +449,41 @@ process odgiDraw { """ } -// TODO we can parallelize this for each reference given in ${params.vcf_spec} process vg_deconstruct { publishDir "${params.outdir}/vg_deconstruct", mode: "${params.publish_dir_mode}" input: - path(graph) + tuple path(graph), val(vcf_spec) output: - path("${graph}.*.vcf") + path("${graph}.*.vcf"), emit: vg_deconstruct_vcf + path("*.vcf.stats"), optional: true, emit: vg_deconstruct_bcftools_stats """ - for s in \$(echo "${params.vcf_spec}" | tr ',' ' '); - do - ref=\$(echo "\$s" | cut -f 1 -d:) - delim=\$(echo "\$s" | cut -f 2 -d:) - vcf="${graph}".\$(echo \$ref | tr '/|' '_').vcf - vg deconstruct -P \$ref -H \$delim -e -a -t "${task.cpus}" "${graph}" > \$vcf - done + ref=\$(echo "$vcf_spec" | cut -f 1 -d:) + delim=\$(echo "$vcf_spec" | cut -f 2 -d:) + pop_length=\$(echo "$vcf_spec" | cut -f 3 -d:) + if [[ -z \$pop_length ]]; then + pop_length=0 + fi + vcf="${graph}".\$(echo \$ref | tr '/|' '_').vcf + vg deconstruct -P \$ref -H \$delim -e -a -t "${task.cpus}" "${graph}" > \$vcf + bcftools stats \$vcf > \$vcf.stats + + if [[ \$pop_length -gt 0 ]]; then + vcf_decomposed=${graph}.final.\$(echo \$ref | tr '/|' '_').decomposed.vcf + vcf_decomposed_tmp=\$vcf_decomposed.tmp.vcf + bgzip -c -@ ${task.cpus} \$vcf > \$vcf.gz + vcfbub -l 0 -a \$pop_length --input \$vcf.gz | vcfwave -I 1000 -t ${task.cpus} > \$vcf_decomposed_tmp + + #TODO: to remove when vcfwave will be bug-free + # The TYPE info sometimes is wrong/missing + # There are variants without the ALT allele + bcftools annotate -x INFO/TYPE \$vcf_decomposed_tmp | awk '\$5 != "."' > \$vcf_decomposed + rm \$vcf_decomposed_tmp \$vcf.gz + + bcftools stats \$vcf_decomposed > \$vcf_decomposed.stats +fi """ } @@ -369,6 +492,7 @@ process multiQC { publishDir "${params.outdir}", mode: "${params.publish_dir_mode}" input: + path vg_deconstruct_bcftools_stats path odgi_stats path odgi_viz path odgi_draw @@ -387,51 +511,77 @@ process multiQC { workflow { main: + if (!fai_path.exists() || !gzi_path.exists()) { // the assumption is that none of the files exist if only one does not exist + samtoolsFaidx(fasta) + fai = samtoolsFaidx.out.samtools_fai.collect() + gzi = samtoolsFaidx.out.samtools_gzi.collect() + } else { + fai = channel.fromPath("${params.input}.fai").collect() + gzi = channel.fromPath("${params.input}.gzi").collect() + } if (params.wfmash_only) { // TODO Once we changed the way we changed the publish_dir_mode, we have to emit the .paf file as default, else not if (params.wfmash_chunks == 1) { - wfmash(fasta) + wfmash(fasta, fai, gzi) } else { - wfmashMap(fasta) + wfmashMap(fasta, fai, gzi) splitApproxMappingsInChunks(wfmashMap.out) - wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten())) + // TODO update this once I understood it + wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten()), fai, gzi) } } else { - if (params.wfmash_chunks == 1) { - wfmash(fasta) - seqwish(fasta, wfmash.out.collect{it[1]}) + if (params.paf != null) { + paf_ch = Channel.fromPath(params.paf) + seqwish(fasta, paf_ch) } else { - wfmashMap(fasta) - splitApproxMappingsInChunks(wfmashMap.out) - wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten())) - seqwish(fasta, wfmashAlign.out.collect()) + if (params.wfmash_chunks == 1) { + wfmash(fasta, fai, gzi) + seqwish(fasta, wfmash.out.collect{it[1]}) + } else { + wfmashMap(fasta, fai, gzi) + splitApproxMappingsInChunks(wfmashMap.out) + wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten()), fai, gzi) + seqwish(fasta, wfmashAlign.out.collect()) + } } smoothxg(seqwish.out) gfaffix(smoothxg.out.gfa_smooth) - odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.consensus_smooth.flatten(), gfaffix.out.gfa_norm)) - odgiStats(odgiBuild.out) + odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.consensus_smooth.flatten())) + odgiStats(odgiBuild.out.mix(gfaffix.out.og_norm)) odgiVizOut = Channel.empty() if (do_1d) { - odgiVizOut = odgiViz(odgiBuild.out.filter( ~/.*smoothxg.*/ )) + odgiVizOut = odgiViz(gfaffix.out.og_norm) } odgiDrawOut = Channel.empty() if (do_2d) { - odgiLayout(odgiBuild.out.filter( ~/.*smoothxg.*/ )) + odgiLayout(gfaffix.out.og_norm) odgiDrawOut = odgiDraw(odgiLayout.out) } - if (params.vcf_spec != false) { - vg_deconstruct(gfaffix.out.gfa_norm) - } - - multiQC( + ch_vcf_spec = Channel.empty() + vg_deconstruct = Channel.empty() + if (params.vcf_spec != null) { + ch_vcf_spec = Channel.from(params.vcf_spec).splitCsv().flatten() + vg_deconstruct(gfaffix.out.gfa_norm.combine(ch_vcf_spec)) + // TODO add bcftools + multiQC( + vg_deconstruct.out.vg_deconstruct_bcftools_stats.collect().ifEmpty([]), odgiStats.out.collect().ifEmpty([]), odgiVizOut.collect().ifEmpty([]), odgiDrawOut.collect().ifEmpty([]), ch_multiqc_config - ) + ) + } else { + multiQC( + vg_deconstruct.collect().ifEmpty([]), + odgiStats.out.collect().ifEmpty([]), + odgiVizOut.collect().ifEmpty([]), + odgiDrawOut.collect().ifEmpty([]), + ch_multiqc_config + ) + } } } @@ -461,58 +611,57 @@ def helpMessage() { nextflow run nf-core/pangenome --input 'data/input.fa.gz' -profile docker Mandatory arguments: - --input [file] Path to input FASTA (must be surrounded with quotes) + --input [file] Path to bgzipped input FASTA (must be surrounded with quotes) + -- n_mappings [int] Number of mappings to retain for each segment. -profile [str] Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, test, awsbatch, and more + PAF options: + --paf [file] Optional input to skip the all vs. all alignment wfmash phase directly starting with seqwish. Wfmash options: --wfmash_map_pct_id [n] percent identity in the wfmash mashmap step [default: 90] - --wfmash_n_mappings [n] number of secondary mappings to retain in 'map' filter mode [default: 10] - --wfmash_segment_length [n] segment length for mapping [default: 3000] - --wfmash_block_length [n] minimum block length filter for mapping [default: 3 * wfmash_segment_length] - --wfmash_mash_kmer [n] kmer size for mashmap [default: 16] + --wfmash_segment_length [n] segment length for mapping [default: 5000] + --wfmash_block_length [n] minimum block length filter for mapping + --wfmash_mash_kmer [n] kmer size for mashmap + --wfmash_mash_kmer_thres [n] ignore the top % most-frequent kmers [default: 0.001] --wfmash_merge_segments merge successive mappings [default: OFF] --wfmash_no_splits disable splitting of input sequences during mapping [default: OFF] --wfmash_exclude--delim [c] skip mappings between sequences with the same name prefix before the given delimiter character [default: all-vs-all and !self] --wfmash_chunks The number of files to generate from the approximate wfmash mappings to scale across a whole cluster. It is recommended to set this to the number of available nodes. If only one machine is available, leave it at 1. [default: 1] --wfmash_only If this parameter is set, only the wfmash alignment step of the pipeline is executed. This option is offered for users who want to use wfmash on a cluster. [default: OFF] + --wfmash_sparse_map keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0] + --wfmash_temp_dir [str] directory for temporary files Seqwish options: - --seqwish_min_match_length [n] ignore exact matches below this length [default: 47] + --seqwish_min_match_length [n] ignore exact matches below this length [default: 19] --seqwish_transclose_batch [n] number of bp to use for transitive closure batch [default: 10000000] + --seqwish_sparse_factor [n] keep this randomly selected fraction of input matches [default: no sparsification] + --seqwish_temp_dir [str] directory for temporary files Smoothxg options: --smoothxg_num_haps [n] number of haplotypes in the given FASTA [default: wfmash_n_mappings] --smoothxg_max_path_jump [n] maximum path jump to include in block [default: 0] --smoothxg_max_edge_jump [n] maximum edge jump before breaking [default: 0] - --smoothxg_max_poa_length [n] maximum sequence length to put into POA, can be a comma-separated list; - for each element smoothxg will be executed once [default: 4001,4507] - --smoothxg_consensus_spec [str] consensus graph specification: write the consensus graph to - BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter - (which defines the length of divergences from consensus paths to preserve in the - output), optionally a file containing reference paths to preserve in the output, - a flag (y/n) indicating whether we should also use the POA consensus paths, a - minimum coverage of consensus paths to retain (min_cov), and a maximum allele - length (max_len, defaults to 1e6); implies -a; example: - cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000 - [default: OFF] + --smoothxg_poa_length [n] maximum sequence length to put into POA, can be a comma-separated list; + for each element smoothxg will be executed once [default: 700,900,1100] --smoothxg_consensus_prefix [n] use this prefix for consensus path names [default: Consensus_] --smoothxg_block_ratio_min [n] minimum small / large length ratio to cluster in a block [default: 0.0] - --smoothxg_block_id_min [n] split blocks into groups connected by this identity threshold [default: 0.95] + --smoothxg_block_id_min [n] split blocks into groups connected by this identity threshold [default: wfmash_map_pct_id / 100.0] --smoothxg_pad_max_depth [n] path depth at which we don't pad the POA problem [default: 100] --smoothxg_poa_padding [n] pad each end of each sequence in POA with N*(longest_poas_seq) bp [default: 0.03] - --smoothxg_poa_params [str] score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2 - [default: 1,9,16,2,41,1] + --smoothxg_poa_params [str] score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2 may also be given as presets: asm5, asm10, asm15, asm20 [default: 1,19,39,3,81,1 = asm5] + --smoothxg_run_abpoa run abPOA [default: SPOA] + --smoothxg_run_global_poa run the POA in global mode [default: local mode] --smoothxg_write_maf [n] write MAF output representing merged POA blocks [default: OFF] + --smoothxg_keep_intermediate_files keep intermediate graphs during smoothxg step + --smoothxg_temp_dir [str] directory for temporary files Visualization options: - --viz Generate 1D and 2D visualisations of the built graphs [default: OFF] + --no_viz Set if you don't want the 1D visualizations. + --no_layout Set if you don't want the computational expensive 2D layout. VCF options: - --vcf_spec specify a set of VCFs to produce with SPEC = REF:DELIM[,REF:DELIM]* - the paths matching ^REF are used as a reference, while the sample haplotypes - are derived from path names, e.g. when DELIM=# and with '-V chm13:#', - a path named HG002#1#ctg would be assigned to sample HG002 phase 1 [default: OFF] + --vcf_spec specify a set of VCFs to produce with SPEC = REF:DELIM[:LEN][,REF:DELIM:[LEN]]* the paths matching ^REF are used as a reference, while the sample haplotypes are derived from path names, e.g. when DELIM=# and with '-V chm13:#', a path name HG002#1#ctg would be assigned to sample HG002 phase 1. If LEN is specified and greater than 0, the VCFs are decomposed, filtering sites whose max allele length is greater than LEN. [default: off] Other options: --outdir [file] The output directory where the results will be saved [default: ./results] diff --git a/nextflow.config b/nextflow.config index 5ad6b56f..851f7b3f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,7 +11,13 @@ params { // Workflow flags // Input options + // FASTA input = null + // number of mappings + n_mappings = null // default could be the number of input sequences, but then I would have to add another process + + // Optional PAF input + paf = null // Output options outdir = "./results" @@ -20,49 +26,58 @@ params { // nf-core: Specify your pipeline's command line flags // Visualization - viz = false + no_viz = false + no_layout = false // Alignment options wfmash_map_pct_id = 90 - wfmash_n_mappings = 10 // default could be the number of input sequences - wfmash_segment_length = 3000 - wfmash_block_length = 3 * wfmash_segment_length - wfmash_mash_kmer = 16 + wfmash_segment_length = 5000 + wfmash_block_length = null + wfmash_mash_kmer = null + wfmash_mash_kmer_thres = null + wfmash_sparse_map = null wfmash_merge_segments = false wfmash_no_splits = false - wfmash_exclude_delim = false + wfmash_exclude_delim = null wfmash_chunks = 1 wfmash_only = false + wfmash_temp_dir = null // Seqwish options - seqwish_min_match_length = 47 + seqwish_min_match_length = 19 seqwish_transclose_batch = 10000000 + seqwish_sparse_factor = 0 + seqwish_temp_dir = null // Smoothxg options - smoothxg_num_haps = false + smoothxg_num_haps = null smoothxg_max_path_jump = 0 smoothxg_max_edge_jump = 0 - smoothxg_poa_length = "4001,4507" - smoothxg_block_id_min = 0.95 + smoothxg_poa_length = "700,900,1100" + smoothxg_block_id_min = null smoothxg_block_ratio_min = 0 smoothxg_pad_max_depth = 100 - smoothxg_poa_padding = 0.03 + smoothxg_poa_padding = 0.001 // poa param suggestions from minimap2 // - asm5, --poa-params 1,19,39,3,81,1, ~0.1 divergence // - asm10, --poa-params 1,9,16,2,41,1, ~1 divergence // - asm20, --poa-params 1,4,6,2,26,1, ~5% divergence - smoothxg_poa_params = "1,9,16,2,41,1" + smoothxg_poa_params = null smoothxg_write_maf = false - smoothxg_consensus_spec = false + // smoothxg_consensus_spec = false smoothxg_consensus_prefix = "Consensus_" + smoothxg_temp_dir = null + smoothxg_keep_intermediate_files = null + smoothxg_run_abpoa = null + smoothxg_run_global_poa = null - // vcf_spec = "gi|568815561:#,gi|568815567:#" - vcf_spec = false + // vcf_spec = "gi|568815561:#,gi|568815567:#:10000" + vcf_spec = null // Boilerplate options - multiqc_config = false - email = false - email_on_fail = false + multiqc_config = null + email = null + email_on_fail = null max_multiqc_email_size = 25.MB plaintext_email = false monochrome_logs = false @@ -72,10 +87,10 @@ params { // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - hostnames = false - config_profile_description = false - config_profile_contact = false - config_profile_url = false + hostnames = null + config_profile_description = null + config_profile_contact = null + config_profile_url = null validate_params = true show_hidden_params = false schema_ignore_params = 'genomes,input_paths' @@ -224,4 +239,4 @@ def check_max(obj, type) { return obj } } -} +} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index 84f105bc..87d04218 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,7 +11,8 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "input" + "input", + "n_mappings" ], "properties": { "input": { @@ -20,6 +21,11 @@ "description": "Input FASTA file.", "help_text": "Use this to specify the location of your input FASTA file. For example:\n\n```bash\n--input 'path/to/data/input.fa.gz'\n```\n\n." }, + "n_mappings": { + "type": "integer", + "fa_icon": "fas fa-dna", + "description": "Number of mappings to retain for each segment." + }, "outdir": { "type": "string", "description": "The output directory where the results will be saved.", @@ -32,6 +38,11 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "paf": { + "type": "string", + "fa_icon": "fas fa-align-center", + "description": "Optional input to skip the all vs. all alignment phase directly starting with seqwish." } } }, @@ -47,15 +58,9 @@ "description": "percent identity in the wfmash mashmap step", "fa_icon": "fas fa-align-center" }, - "wfmash_n_mappings": { - "type": "integer", - "default": 10, - "description": "number of secondary mappings to retain in 'map' filter mode", - "fa_icon": "fas fa-align-center" - }, "wfmash_segment_length": { "type": "integer", - "default": 3000, + "default": 5000, "description": "segment length for mapping", "fa_icon": "fas fa-align-center" }, @@ -66,10 +71,13 @@ }, "wfmash_mash_kmer": { "type": "integer", - "default": 16, "description": "kmer size for mashmap", "fa_icon": "fas fa-align-center" }, + "wfmash_mash_kmer_thres": { + "type": "number", + "description": "ignore the top % most-frequent kmers [default: 0.001]" + }, "wfmash_merge_segments": { "type": "boolean", "description": "merge successive mappings", @@ -93,6 +101,14 @@ "wfmash_only": { "type": "boolean", "description": "If this parameter is set, only the wfmash alignment step of the pipeline is executed. This option is offered for users who want to use wfmash on a cluster." + }, + "wfmash_sparse_map": { + "type": "string", + "description": "keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0]" + }, + "wfmash_temp_dir": { + "type": "string", + "description": "directory for temporary files" } } }, @@ -110,9 +126,18 @@ }, "seqwish_transclose_batch": { "type": "integer", - "default": 1000000, + "default": 10000000, "description": "Number of bp to use for transitive closure batch.", "fa_icon": "fas fa-project-diagram" + }, + "seqwish_sparse_factor": { + "type": "integer", + "default": 0, + "description": "keep this randomly selected fraction of input matches [default: no sparsification]" + }, + "seqwish_temp_dir": { + "type": "string", + "description": "directory for temporary files" } }, "fa_icon": "fas fa-project-diagram" @@ -124,8 +149,8 @@ "default": "", "properties": { "smoothxg_num_haps": { - "type": "string", - "default": "wfmash_n_mappings", + "type": "integer", + "default": 0, "description": "number of haplotypes in the given FASTA" }, "smoothxg_max_path_jump": { @@ -140,14 +165,9 @@ }, "smoothxg_poa_length": { "type": "string", - "default": "4001,4507", + "default": "700,900,1100", "description": "maximum sequence length to put into POA, can be a comma-separated list; for each element smoothxg will be executed once" }, - "smoothxg_consensus_spec": { - "type": "string", - "description": "Consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000.", - "fa_icon": "fab fa-superpowers" - }, "smoothxg_consensus_prefix": { "type": "string", "default": "Consensus_", @@ -155,14 +175,13 @@ }, "smoothxg_block_ratio_min": { "type": "number", - "default": 0, - "description": "minimum small / large length ratio to cluster in a block" + "description": "minimum small / large length ratio to cluster in a block", + "default": 0 }, "smoothxg_block_id_min": { "type": "number", "description": "Split blocks into groups connected by this identity threshold.", - "fa_icon": "fas fa-percentage", - "default": 0.95 + "fa_icon": "fas fa-percentage" }, "smoothxg_pad_max_depth": { "type": "integer", @@ -171,18 +190,34 @@ }, "smoothxg_poa_padding": { "type": "number", - "default": 0.03, + "default": 0.001, "description": "pad each end of each sequence in POA with N*(longest_poas_seq) bp" }, "smoothxg_poa_params": { "type": "string", - "default": "1,9,16,2,41,1", - "description": "Score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2.", + "default": "1,19,39,3,81,1", + "description": "score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2 may also be given as presets: asm5, asm10, asm15, asm20 [default: 1,19,39,3,81,1 = asm5]", "fa_icon": "fab fa-superpowers" }, + "smoothxg_run_abpoa": { + "type": "boolean", + "description": "run abPOA [default: SPOA]" + }, + "smoothxg_run_global_poa": { + "type": "string", + "description": "run the POA in global mode [default: local mode]" + }, "smoothxg_write_maf": { "type": "boolean", "description": "write MAF output representing merged POA blocks" + }, + "smoothxg_keep_intermediate_files": { + "type": "string", + "description": "keep intermediate graphs during smoothxg step" + }, + "smoothxg_temp_dir": { + "type": "string", + "description": "directory for temporary files" } }, "fa_icon": "fas fa-project-diagram" @@ -191,11 +226,11 @@ "title": "VCF options", "type": "object", "description": "Optios for vg deconstruct.", - "default": "", + "default": "false", "properties": { "vcf_spec": { "type": "string", - "description": "specify a set of VCFs to produce with SPEC = REF:DELIM[,REF:DELIM]* the paths matching ^REF are used as a reference, while the sample haplotypes are derived from path names, e.g. when DELIM=# and with '-V chm13:#', a path named HG002#1#ctg would be assigned to sample HG002 phase 1" + "description": "specify a set of VCFs to produce with SPEC = REF:DELIM[:LEN][,REF:DELIM:[LEN]]* the paths matching ^REF are used as a reference, while the sample haplotypes are derived from path names, e.g. when DELIM=# and with '-V chm13:#', a path name HG002#1#ctg would be assigned to sample HG002 phase 1. If LEN is specified and greater than 0, the VCFs are decomposed, filtering sites whose max allele length is greater than LEN. [default: off]" } } }, @@ -206,10 +241,13 @@ "default": "", "fa_icon": "fas fa-project-diagram", "properties": { - "viz": { + "no_viz": { "type": "boolean", - "description": "Generate 1D and 2D visualisations of the built graphs", - "fa_icon": "fas fa-project-diagram" + "description": "Set if you don't want the 1D visualizations." + }, + "no_layout": { + "type": "boolean", + "description": "Set if you don't want the computational expensive 2D layout." } } }, @@ -415,4 +453,4 @@ "$ref": "#/definitions/institutional_config_options" } ] -} \ No newline at end of file +}