diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d48b275..c5ea8246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` +- [#135](https://github.com/nf-core/bacass/pull/135) Replaced nf-core MultiQC module with a custom MultiQC module. + ### `Added` +- [#135](https://github.com/nf-core/bacass/pull/135) Implementation of KmerFinder subworkflow Custom Quast, and Custom MultiQC Reports: + + - Added KmerFinder subworkflow for read quality control, purity assessment, and sample grouping based on reference genome estimation. + - Enhanced Quast Assembly QC to run both general and reference genome-based analyses when KmerFinder is invoked. + - Implemented custom MultiQC module with multiqc_config.yml files for different assembly modes (short, long, hybrid). + - Generated custom MultiQC HTML report consolidating metrics from KmerFinder, Quast, and other relevant sources. + - [#133](https://github.com/nf-core/bacass/pull/133) Update nf-core/bacass to the new nf-core 2.14.1 `TEMPLATE`. ### `Fixed` diff --git a/README.md b/README.md index af950fd1..589004f5 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ### Short Read Assembly -This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) to verify sample purity. +This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. ### Long Read Assembly -For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). +For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. + The pipeline can then perform long read assembly utilizing [Unicycler](https://github.com/rrwick/Unicycler), [Miniasm](https://github.com/lh3/miniasm) in combination with [Racon](https://github.com/isovic/racon), [Canu](https://github.com/marbl/canu) or [Flye](https://github.com/fenderglass/Flye) by using the [Dragonflye](https://github.com/rpetit3/dragonflye)(\*) pipeline. Long reads assembly can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files. > [!NOTE] @@ -47,6 +48,10 @@ For users specifying both short read and long read (NanoPore) data, the pipeline In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core). +If Kmerfinder is invoked, the pipeline will group samples according to the [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/)-estimated reference genomes. Afterwards, two QUAST steps will be carried out: an initial ('general') [QUAST](http://bioinf.spbau.ru/quast) of all samples without reference genomes, and subsequently, a 'by reference genome' [QUAST](http://bioinf.spbau.ru/quast) to aggregate samples with their reference genomes. + +> NOTE: This scenario is supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed only. + ## Usage > [!NOTE] diff --git a/assets/multiqc_config_hybrid.yml b/assets/multiqc_config_hybrid.yml new file mode 100644 index 00000000..4c036265 --- /dev/null +++ b/assets/multiqc_config_hybrid.yml @@ -0,0 +1,166 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (shorts & long reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input short reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed short reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Input long reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median long reads lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median long reads quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs (hybrid assembly)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (hybrid assembly)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (hybrid assembly)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction (hybrid assembly)": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml new file mode 100644 index 00000000..51795ec6 --- /dev/null +++ b/assets/multiqc_config_long.yml @@ -0,0 +1,140 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + +report_section_order: + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (long-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median read lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median read quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml new file mode 100644 index 00000000..2ce2eca6 --- /dev/null +++ b/assets/multiqc_config_short.yml @@ -0,0 +1,135 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (short-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/csv_to_yaml.py b/bin/csv_to_yaml.py new file mode 100755 index 00000000..2a14249b --- /dev/null +++ b/bin/csv_to_yaml.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +import sys +import argparse +import csv +import yaml + + +def parse_args(args=None): + Description = "Create a yaml file from csv input file grouping samples as keys and resting fields as their value pair." + + Epilog = "Example usage: python csv_to_yaml.py -i myfile.csv -k 'sample_name' -o converted_file" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-i", "--input", type=str, dest="CSV_FILE", help="Input file in CSV format." + ) + + parser.add_argument( + "-k", + "--key_field", + type=str, + dest="KEY_FIELD", + help="Name of the key/column grupping field in the input csv.", + ) + + parser.add_argument( + "-op", + "--output_prefix", + type=str, + default="output_file", + dest="OUT_PREFIX", + help="Output file name", + ) + return parser.parse_args(args) + + +def parse_csv(csv_file): + with open(csv_file, "r") as c: + csv_reader = csv.DictReader(c) + data = [row for row in csv_reader] + return data + + +def create_yaml(data, key, output_prefix): + yaml_data = { + entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data + } + with open(output_prefix + ".yaml", "w") as yaml_file: + yaml.dump(yaml_data, yaml_file, default_flow_style=False) + + +def main(args=None): + args = parse_args(args) + file_list = parse_csv(args.CSV_FILE) + + create_yaml(data=file_list, key=args.KEY_FIELD, output_prefix=args.OUT_PREFIX) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/download_reference.py b/bin/download_reference.py new file mode 100755 index 00000000..88e89364 --- /dev/null +++ b/bin/download_reference.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +EDITED BY: Daniel VM +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +EDITED: 14-11-2023 +DESCRIPTION: 20-05-2024 + Given a file with the kmerfinder results and frequencies (probably + created by find_common_reference.py), and the NCBI assembly sheet, + download the top-reference genome, gff and protein files from + the NCBI ftp. + +INPUT: + -FILE: file containing the ranking of references from kmerfinder created by the script find_common_references + -REFERENCE: file with the NCBI reference list + -OUTDIR: name of the output dir + +OUTPUT: + - *_fna.gz: file with the top-reference genome + - *_gff.gz: file with the top-reference gff + - *_protein.gz: file with the top-reference proteins + +USAGE: + python download_reference.py + -file [FILE] + -reference [REFERENCE] + -out_dir [OUTDIR] + +REQUIREMENTS: + -Python >= 3.6 + -Python wget + +DISCLAIMER: + This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. +================================================================ +END_OF_HEADER +================================================================ +""" + +import sys +import argparse +import os + +# import wget +import requests + + +# TODO: Generate report +def parse_args(args=None): + Description = "download the reference files \ + (fna, faa, gff)from the reference NCBI file." + Epilog = """Usage example: \ + python download_reference.py \ + -file \ + -reference \ + -out_dir """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-file", help="File containing the ranking of references from kmerfinder." + ) + parser.add_argument( + "-reference", + help="File containing the paths to bacterial references. See example in: https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt", + ) + parser.add_argument("-out_dir", help="Output directory.") + + return parser.parse_args(args) + + +def download_references(file, reference, out_dir): + """ + Downloads the top reference from the NCBI database + """ + + reference_ends = ["_genomic.fna.gz", "_protein.faa.gz", "_genomic.gff.gz"] + + # extract the most common reference from file + with open(file) as infile: + infile = infile.readlines() + infile = [ + item.replace("\n", "").split("\t") + for item in infile + if not item.startswith("#") + ] + top_reference = infile[0][0] + + with open(str(top_reference) + ".winner", "w") as topref: + topref.write(top_reference) + + # create the outdir (do nothing if already there) + try: + os.mkdir(out_dir) + except FileExistsError: + pass + + # open the reference and find the reference + with open(reference) as inref: + inref = inref.readlines() + inref = [ + item.replace("\n", "").split("\t") + for item in inref + if not item.startswith("#") + ] + + # Initialize an empty list to store the URLs + dir_url = [] + + # Iterate over each row in the inref + for row in inref: + # Construct the ref_query using assembly_accession and asm_name + assembly_accession = row[0] + asm_name = row[15] + ref_query = f"{assembly_accession}_{asm_name}" + + # Check if ref_query matches the search value + if ref_query == top_reference: + # make url # Append the 20th element of the row to the URL list: + assembly_url = row[19] + "/" + ref_query + dir_url.append(assembly_url) + + if len(dir_url) == 0: + print( + "No assemblies responding to the top reference: ", + top_reference, + " were found", + ) + sys.exit(1) + + dir_url = str(dir_url[0]) + + # get url and reference file + for r_end in reference_ends: + out_file = out_dir + "/" + top_reference + r_end + file_url = dir_url + r_end + print(file_url) + + # wget.download(file_url, out_file) + response = requests.get(file_url, stream=True) + with open(out_file, "wb") as out: + for chunk in response.iter_content(chunk_size=8192): + out.write(chunk) + + return + + +def main(args=None): + args = parse_args(args) + download_references(args.file, args.reference, args.out_dir) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/find_common_reference.py b/bin/find_common_reference.py new file mode 100755 index 00000000..e26aaf53 --- /dev/null +++ b/bin/find_common_reference.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +DESCRIPTION: + Given a directory with kmerfinder results, sum them up + in an outfile named by the user. + +INPUT: + -DIRECTORY: directory containing all kmerfinder results. + -OUTFILE: Name of the file to write the whole results in. + +OUTPUT: + -OUTFILE: file containing the kmerfinder results. + +USAGE: + python find_common_reference.py -d [DIRECTORY] -o [OUTFILE] +REQUIREMENTS: + -Python >= 3.6 + +DISCLAIMER: This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. + +================================================================ +END_OF_HEADER +================================================================ +""" +import os +import sys +import errno +import argparse + + +def parse_args(args=None): + """ + Parse the args given to argparser + """ + Description = "Fetch kmerfinder result files and get the most used reference." + Epilog = """Example usage: python find_common_reference.py -d -o """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("-d", help="Input directory.") + parser.add_argument("-o", help="Output file.") + return parser.parse_args(args) + + +def group_references(kmer_result_dir, out_file): + """ + Unifies the kmerfinder results, and counts their occurrences + """ + reference_assembly = {} + + # for file in dir + for k_file in os.listdir(kmer_result_dir): + # open file + with open(os.path.join(kmer_result_dir, k_file), "r") as fh: + file_lines = fh.readlines() + + # remove heading + try: + heading = file_lines[0].split("\t") + first_line = file_lines[1].split("\t") + + # where is the assembly in the header? + # find reference according to index + index_assembly = heading.index("# Assembly") + reference = first_line[index_assembly] + + # add it to the dict if not there + if reference not in reference_assembly: + index_description = heading.index("Description") + reference_assembly[reference] = [0, first_line[index_description]] + # sum 1 for another occurrence + reference_assembly[reference][0] += 1 + except IndexError: + pass + + # sort it (more occurrences first in file) + order_reference = dict( + sorted(reference_assembly.items(), key=lambda x: x[1][0], reverse=True) + ) + + # write it + with open(out_file, "w") as f_out: + for key, value in order_reference.items(): + f_out.write(key + "\t" + str(value[0]) + "\t" + value[1] + "\n") + return + + +def main(args=None): + args = parse_args(args) + group_references(args.d, args.o) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/kmerfinder_summary.py b/bin/kmerfinder_summary.py new file mode 100755 index 00000000..5d9fb513 --- /dev/null +++ b/bin/kmerfinder_summary.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 + + +import argparse +import sys +import re +import csv +import pickle +import os + + +################# +### FUNCTIONS ### +################# + + +def check_arg(args=None): + """ + Description: + Function collect arguments from command line using argparse + Input: + args # command line arguments + Constant: + None + Variables + parser + Return + parser.parse_args() # Parsed arguments + """ + + parser = argparse.ArgumentParser( + prog="07-kmerfinder.py", + formatter_class=argparse.RawDescriptionHelpFormatter, + description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME + ) + + parser.add_argument( + "--path", + "-p", + required=True, + help="Insert path of results.txt file like /home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME + ) + + parser.add_argument( + "--output_bn", "-b", required=True, help="The output in binary file" + ) + + parser.add_argument( + "--output_csv", "-c", required=True, help="The output in csv file" + ) + + # Example: python3 parse_kmerfinder.py -p /home/s.gonzalez/07-kmerfinder -b p_dic.dicke -c p_kmer.csv + + return parser.parse_args() + + +################# +### FUNCTIONS ### +################# + + +def kmerfinder_dictionary(file_txt): + """ + Description: + Function to extract the relevant part of result.txt file + Input: + result.txt file + Return: + dictionary + """ + + step = "07-kmerfinder_" # FIXME + + num_lines = sum(1 for line in open(file_txt)) + hits = num_lines - 1 # to count the total number of hits + lookupfile = open(file_txt, "r") + lines = lookupfile.readlines() + parameters = lines[0].strip().split("\t") + if num_lines > 1: + values_best_hit = lines[1].strip().split("\t") + if num_lines > 2: + values_second_hit = lines[2].strip().split("\t") + + kmer_dict = {} + + for i in range(len(parameters)): + if num_lines > 1: + kmer_dict[step + "best_hit_" + parameters[i]] = values_best_hit[i] + else: + kmer_dict[step + "best_hit_" + parameters[i]] = "" + + kmer_dict.update(Total_hits_07_kmerfinder=hits) + + if num_lines > 2: + + kmer_dict[step + "second_hit_" + parameters[i]] = values_second_hit[i] + + else: + + kmer_dict[step + "second_hit_" + parameters[i]] = "" + + return kmer_dict + + +################# +### FUNCTIONS ### +################# + + +def dictionary2bn(dictionary, binary_file): + """ + + Description: + Function to create a binary file from a dictionary + Input: + dictionary + Return: + binary file + """ + + pickle_out = open(binary_file, "wb") + pickle.dump(dictionary, pickle_out) + pickle_out.close() + + return + + +################# +### FUNCTIONS ### +################# + + +def dictionary2csv(dictionary, csv_file): + """ + + Description: + Function to create a csv from a dictionary + Input: + dictionary + Return: + csv file + + """ + + header = sorted(set(i for b in map(dict.keys, dictionary.values()) for i in b)) + with open(csv_file, "w", newline="") as f: + write = csv.writer(f) + write.writerow(["sample_name", *header]) + for a, b in dictionary.items(): + write.writerow([a] + [b.get(i, "") for i in header]) + return + + +################### +### MAIN SCRIPT ### +################### + + +if __name__ == "__main__": + + # Variables + version = "07-kmerfinder.py v 0.1.0." # Script version # FIXME + arguments = check_arg(sys.argv[1:]) + + # Create sample_id_list + path = arguments.path + sample_list = [] + tmp = os.listdir(path) + for item in tmp: + if os.path.isdir(os.path.join(path, item)): + if item != "logs": + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + else: + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + + print("sample_list done") + + # Create a dictionary + kmer_all = {} + + for sample in sample_list: + file_name = os.path.join(path, sample + "_results.txt") + kmer_all[sample] = kmerfinder_dictionary(file_name) + + print("kmerfinder_dictionary done") + # print (kmer_all) + + # Save the dicctionary to binary file + + dictionary2bn(kmer_all, arguments.output_bn) + + print("kmerfinder_dictionary_bn done") + + # Convert the dictionary to csv file + + dictionary2csv(kmer_all, arguments.output_csv) + + print("kmerfinder_dictionary_csv done") diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py new file mode 100755 index 00000000..3838ab5f --- /dev/null +++ b/bin/multiqc_to_custom_csv.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python +# Sourced and Edited from nf-core/viralrecon: +# https://github.com/nf-core/viralrecon/blob/master/bin/multiqc_to_custom_csv.py#L59 +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline." + Epilog = "Example usage: python multiqc_to_custom_tsv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. (default: 'multiqc_data').", + ) + parser.add_argument( + "-t", + "--assembly_type", + type=str, + dest="ASSEMBLY_TYPE", + default="short", + help="String defining the assembly mode for genome de novo assembly (options: short, long, hybrid).", + ) + parser.add_argument( + "-op", + "--out_prefix", + type=str, + dest="OUT_PREFIX", + default="summary", + help="Full path to output prefix (default: 'summary').", + ) + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k, v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict( + yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[] +): + integer_fields = [ + "# contigs", + "# contigs (>= 5000 bp)", + "Largest contig", + ] + if os.path.exists(yaml_file): + with open(yaml_file) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + include_sample = True + if len(valid_sample_list) != 0 and key not in valid_sample_list: + include_sample = False + if include_sample: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + val = list(find_tag(yaml_dict[k], j[0])) + ## Fix for Cutadapt reporting reads/pairs as separate values + if j[0] == "r_written" and len(val) == 0: + val = [ + list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2 + ] + if len(val) != 0: + val = val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if j[0] in integer_fields: + val = int(val) + if i not in append_dict[key]: + append_dict[key][i] = val + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = yaml_dict[k] + else: + print("WARNING: File does not exist: {}".format(yaml_file)) + if len(valid_sample_list) != 0: + for key in valid_sample_list: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + if i not in append_dict[key]: + append_dict[key][i] = "NA" + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = "NA" + return append_dict + + +def metrics_dict_to_file( + file_field_list, multiqc_data_dir, out_file, valid_sample_list=[] +): + metrics_dict = {} + field_list = [] + for yaml_file, mapping_list in file_field_list: + yaml_file = os.path.join(multiqc_data_dir, yaml_file) + metrics_dict = yaml_fields_to_dict( + yaml_file=yaml_file, + append_dict=metrics_dict, + field_mapping_list=mapping_list, + valid_sample_list=valid_sample_list, + ) + field_list += [x[0] for x in mapping_list] + + if metrics_dict != {}: + make_dir(os.path.dirname(out_file)) + fout = open(out_file, "w") + header = ["Sample"] + field_list + fout.write("{}\n".format(",".join(header))) + for k in sorted(metrics_dict.keys()): + row_list = [k] + for field in field_list: + if field in metrics_dict[k]: + if metrics_dict[k][field]: + row_list.append(str(metrics_dict[k][field]).replace(",", ";")) + else: + row_list.append("NA") + else: + row_list.append("NA") + fout.write("{}\n".format(",".join(row_list))) + fout.close() + return metrics_dict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + illumina_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input reads", ["before_filtering", "total_reads"]), + ("# Trimmed reads (fastp)", ["after_filtering", "total_reads"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs", ["# contigs"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + nanopore_assembly_files = [ + ( + "multiqc_nanostat.yaml", + [ + ("# Input reads", ["Number of reads_fastq"]), + ("# Median read lenght", ["Median read length_fastq"]), + ("# Median read quality", ["Median read quality_fastq"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs", ["# contigs"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + hybrid_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input short reads", ["before_filtering", "total_reads"]), + ("# Trimmed short reads (fastp)", ["after_filtering", "total_reads"]), + ], + ), + ( + "multiqc_nanostat.yaml", + [ + ("# Input long reads", ["Number of reads_fastq"]), + ("# Median long reads lenght", ["Median read length_fastq"]), + ("# Median long reads quality", ["Median read quality_fastq"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs (hybrid assembly)", ["# contigs"]), + ("# Largest contig (hybrid assembly)", ["Largest contig"]), + ("# N50 (hybrid assembly)", ["N50"]), + ("# % Genome fraction (hybrid assembly)", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + ## Write de novo assembly metrics to file + if args.ASSEMBLY_TYPE == "short": + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == "long": + metrics_dict_to_file( + file_field_list=nanopore_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == "hybrid": + metrics_dict_to_file( + file_field_list=hybrid_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index c1b4f8da..78bd92bb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,9 +35,17 @@ process { ext.args = '' ext.prefix = { "${meta.id}.porechop" } publishDir = [ - path: { "${params.outdir}/trimming/longreads" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.fastq.gz", + mode: params.publish_dir_mode, + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.log", + mode: params.publish_dir_mode, + ] ] } @@ -176,12 +184,21 @@ process { ] } - withName: 'QUAST' { + withName: 'QUAST|QUAST_BYREFSEQID' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> + if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ + null + } else if (filename.startsWith('GCF')){ + "runs_per_reference/${filename}" + } + else { + "${filename}" + } + } ] } @@ -204,8 +221,8 @@ process { ] } - withName: 'MULTIQC' { - ext.args = '' + withName: 'MULTIQC_CUSTOM' { + ext.args = '-k yaml' publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, @@ -229,13 +246,14 @@ if (!params.skip_fastqc) { if (!params.skip_fastp) { process { withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTP' { - ext.args = '' + ext.args = params.fastp_args ? params.fastp_args : '' publishDir = [ [ path: { "${params.outdir}/trimming/shortreads" }, mode: params.publish_dir_mode, pattern: "*.fastp.fastq.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_trimmed ], [ path: { "${params.outdir}/trimming/shortreads/json_html" }, @@ -270,6 +288,29 @@ if (!params.skip_fastp) { } } } +if (!params.skip_kmerfinder) { + process { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{txt,json}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} if (params.annotation_tool == 'bakta') { if (params.baktadb_download == true) { diff --git a/conf/test.config b/conf/test.config index e0581847..0b422f03 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_dfast.config b/conf/test_dfast.config index 71231cde..9edc4a7e 100644 --- a/conf/test_dfast.config +++ b/conf/test_dfast.config @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_full.config b/conf/test_full.config index cebac444..669ae228 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,6 +15,8 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = params.pipelines_testdata_base_path + 'bacass/bacass_full.tsv' - kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv' + kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' + kmerfinderdb = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz' + ncbi_assembly_metadata = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt' } diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index cb25eaca..a524de07 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -23,7 +23,8 @@ params { input = params.pipelines_testdata_base_path + 'bacass/bacass_hybrid.tsv' // some extra args to speed tests up - assembly_type = 'hybrid' - prokka_args = " --fast" + assembly_type ='hybrid' + prokka_args =" --fast" skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_hybrid_dragonflye.config b/conf/test_hybrid_dragonflye.config index e2ca552d..9cf34364 100644 --- a/conf/test_hybrid_dragonflye.config +++ b/conf/test_hybrid_dragonflye.config @@ -27,4 +27,5 @@ params { assembler = 'dragonflye' prokka_args = " --fast" skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_long.config b/conf/test_long.config index 4f1343ba..3cc3a8b6 100644 --- a/conf/test_long.config +++ b/conf/test_long.config @@ -27,4 +27,5 @@ params { assembly_type = 'long' skip_polish = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_long_dragonflye.config b/conf/test_long_dragonflye.config index c51c1310..38301d47 100644 --- a/conf/test_long_dragonflye.config +++ b/conf/test_long_dragonflye.config @@ -23,4 +23,5 @@ params { assembler = 'dragonflye' skip_kraken2 = true skip_polish = true + skip_kmerfinder = true } diff --git a/conf/test_long_miniasm.config b/conf/test_long_miniasm.config index 32a63dbf..d6b5874e 100644 --- a/conf/test_long_miniasm.config +++ b/conf/test_long_miniasm.config @@ -27,4 +27,5 @@ params { assembly_type = 'long' assembler = 'miniasm' kraken2db = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz" + skip_kmerfinder = true } diff --git a/docs/output.md b/docs/output.md index a83dcd6e..500aa6d4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -118,6 +118,20 @@ Exemplary Kraken2 report screenshot: +## Reads QC and Sample purity + +The pipeline includes a dedicated step for short and long reads QC as well as contamination analysis using [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/). This process helps assess the quality and purity of the samples. + +
+Output files + +- `Kmerfinder/{ID}/` + - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information. +- `Kmerfinder/` + - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder. + +
+ ## Assembly Output Trimmed reads are assembled with [Unicycler](https://github.com/rrwick/Unicycler) in `short` or `hybrid` assembly modes. For long-read assembly, there are also `canu` and `miniasm` available. @@ -180,9 +194,10 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo
Output files -- `QUAST` - - `report.tsv`: QUAST's report in text format -- `QUAST/report` +- `QUAST/report/` + - `icarus.html`: QUAST's contig browser as HTML + - `report.html`: QUAST assembly QC as HTML report + - `report.pdf`: QUAST assembly QC as pdf - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf @@ -240,6 +255,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - `multiqc_plots/`: directory containing static images from the report in various formats. + - summary_assembly_metrics_mqc.csv: custom table containing most relevant assembly QC metrics.
diff --git a/modules.json b/modules.json index 15921ac2..6dc814ab 100644 --- a/modules.json +++ b/modules.json @@ -56,11 +56,6 @@ "git_sha": "2c2d1cf80866dbd6dd0ea5d61ddd59533a72d41e", "installed_by": ["modules"] }, - "multiqc": { - "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] - }, "nanoplot": { "branch": "master", "git_sha": "a31407dfaf0cb0d04768d5cb439fc6f4523a6981", diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf new file mode 100644 index 00000000..87f73664 --- /dev/null +++ b/modules/local/find_download_reference.nf @@ -0,0 +1,40 @@ +process FIND_DOWNLOAD_REFERENCE { + tag "${task.process}" + label 'process_medium' + + conda "conda-forge::requests=2.26.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0' : + 'biocontainers/requests:2.26.0' }" + + input: + tuple val(refmeta), path(reports, stageAs: 'reports/*') + path(ncbi_metadata_db) + + output: + tuple val(refmeta), path("*.fna.gz") , emit: fna + tuple val(refmeta), path("*.gff.gz") , emit: gff + tuple val(refmeta), path("*.faa.gz") , emit: faa + tuple val(refmeta), path("references_found.tsv") , emit: references_tsv + tuple val(refmeta), path("*.winner") , emit: winner + path "versions.yml" , emit: versions + + script: + """ + ## Find the common reference genome + find_common_reference.py \\ + -d reports/ \\ + -o references_found.tsv + + ## Download the winner reference genome from the ncbi database + download_reference.py \\ + -file references_found.tsv \\ + -reference $ncbi_metadata_db \\ + -out_dir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf new file mode 100644 index 00000000..cca5f359 --- /dev/null +++ b/modules/local/kmerfinder.nf @@ -0,0 +1,39 @@ +process KMERFINDER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmerfinder=3.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmerfinder:3.0.2--hdfd78af_0' : + 'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(reads), path(kmerfinder_db) + + output: + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}" + // WARNING: Ensure to update software version in this line if you modify the container/environment. + def kmerfinder_version = "3.0.2" + """ + kmerfinder.py \\ + --infile $in_reads \\ + --output_folder . \\ + --db_path ${kmerfinder_db}/bacteria.ATG \\ + -tax ${kmerfinder_db}/bacteria.name \\ + -x + + mv results.txt ${prefix}_results.txt + mv data.json ${prefix}_data.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmerfinder: \$(echo "${kmerfinder_version}") + END_VERSIONS + """ +} diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf new file mode 100644 index 00000000..8e5fe45f --- /dev/null +++ b/modules/local/kmerfinder_summary.nf @@ -0,0 +1,31 @@ +process KMERFINDER_SUMMARY { + tag "kmerfinder_summary" + label 'process_low' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path(report, stageAs: 'reports/*') + + output: + path "*.csv" , emit: summary + path "*.yaml" , emit: yaml + path "versions.yml" , emit: versions + + script: + """ + ## summarizing kmerfinder results + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv + + ## Create a yaml file from csv + csv_to_yaml.py -i kmerfinder_summary.csv -k 'sample_name' -op kmerfinder_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf new file mode 100644 index 00000000..5c0cc9f5 --- /dev/null +++ b/modules/local/multiqc_custom.nf @@ -0,0 +1,63 @@ +process MULTIQC_CUSTOM { + label 'process_medium' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path 'multiqc_config.yaml' + path multiqc_custom_config + path multiqc_logo + path workflow_summary + path methods_description + path software_versions + path ('fastqc/*') + path ('fastp/*') + path ('nanoplot/*') + path ('porechop/*') + path ('pycoqc/*') + path ('kraken2_short/*') + path ('kraken2_long/*') + path ('quast/*') + path ('prokka/*') + path ('bakta/*') + path ('extra/*') + + output: + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_assembly_metrics_mqc.csv" , optional:true, emit: csv_assembly + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def custom_config = multiqc_custom_config ? "--config $multiqc_custom_config" : '' + """ + ## Run MultiQC once to parse tool logs + multiqc -f $args $custom_config . + + ## Collect additional files to be included in the report + if [ -d extra/ ]; then + cp extra/* multiqc_data/ + fi + + ## Create multiqc custom data + multiqc_to_custom_csv.py --assembly_type $params.assembly_type + + ## Avoid the custom Multiqc table when the kmerfinder process is not invoked. + if grep ">skip_kmerfinder<" workflow_summary_mqc.yaml; then + rm *_assembly_metrics_mqc.csv + fi + + ## Run multiqc a second time + multiqc -f $args $custom_config . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml deleted file mode 100644 index ca39fb67..00000000 --- a/modules/nf-core/multiqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: multiqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::multiqc=1.21 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 47ac352f..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - $logo \\ - . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - mkdir multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index 45a9bc35..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. - pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index f1c4242e..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index bfebd802..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,41 +0,0 @@ -{ - "multiqc_versions_single": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:48:55.657331" - }, - "multiqc_stub": { - "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:49:49.071937" - }, - "multiqc_versions_config": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:49:25.457567" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf index de29e355..3b45dc5d 100644 --- a/modules/nf-core/racon/main.nf +++ b/modules/nf-core/racon/main.nf @@ -11,7 +11,7 @@ process RACON { tuple val(meta), path(reads), path(assembly), path(paf) output: - tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly path "versions.yml" , emit: versions when: @@ -26,9 +26,9 @@ process RACON { "${paf}" \\ $args \\ "${assembly}" > \\ - ${prefix}_assembly_consensus.fasta + ${prefix}.consensus.fasta - gzip -n ${prefix}_assembly_consensus.fasta + gzip -n ${prefix}.consensus.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/racon/racon.diff b/modules/nf-core/racon/racon.diff new file mode 100644 index 00000000..c6e8d118 --- /dev/null +++ b/modules/nf-core/racon/racon.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/racon' +--- modules/nf-core/racon/main.nf ++++ modules/nf-core/racon/main.nf +@@ -11,7 +11,7 @@ + tuple val(meta), path(reads), path(assembly), path(paf) + + output: +- tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly ++ tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: +@@ -26,9 +26,9 @@ + "${paf}" \\ + $args \\ + "${assembly}" > \\ +- ${prefix}_assembly_consensus.fasta ++ ${prefix}.consensus.fasta + +- gzip -n ${prefix}_assembly_consensus.fasta ++ gzip -n ${prefix}.consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/nextflow.config b/nextflow.config index 04ffe9d3..0535ddb3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,11 +13,17 @@ params { input = null // QC and trimming options + fastp_args = "" + save_trimmed = false save_trimmed_fail = false save_merged = false // Contamination_screening - kraken2db = "" + kraken2db = '' + kmerfinderdb = '' + reference_fasta = '' + reference_gff = '' + ncbi_assembly_metadata = '' // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] @@ -42,6 +48,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false + skip_kmerfinder = false skip_pycoqc = false skip_annotation = false skip_polish = false @@ -66,7 +73,7 @@ params { validate_params = true schema_ignore_params = 'modules,igenomes_base' version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' // Config options diff --git a/nextflow_schema.json b/nextflow_schema.json index 698df8e9..ce8b1a54 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,6 +42,14 @@ "description": "Parameters for QC and trim short-reads", "default": "", "properties": { + "fastp_args": { + "type": "string", + "description": "This can be used to pass arguments to [Fastp](https://github.com/OpenGene/fastp)" + }, + "save_trimmed": { + "type": "boolean", + "description": "save trimmed files" + }, "save_trimmed_fail": { "type": "boolean", "description": "save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`" @@ -72,6 +80,22 @@ "fa_icon": "fab fa-gitkraken", "help_text": "See [Kraken2 homepage](https://benlangmead.github.io/aws-indexes/k2) for download\nlinks. Minikraken2 8GB is a reasonable choice, since we run Kraken here mainly just to check for\nsample purity.", "description": "Path to Kraken2 database." + }, + "kmerfinderdb": { + "type": "string", + "description": "Path to the Kmerfinder bacteria database." + }, + "reference_fasta": { + "type": "string", + "description": "Reference FASTA file." + }, + "reference_gff": { + "type": "string", + "description": "Reference GFF file." + }, + "ncbi_assembly_metadata": { + "type": "string", + "description": "Master file (*.txt) containing a summary of assemblies available in GeneBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" } } }, @@ -189,6 +213,11 @@ "fa_icon": "fas fa-forward", "description": "Skip running Kraken2 classifier on reads." }, + "skip_kmerfinder": { + "type": "boolean", + "description": "Skip contamination analysis with Kmerfinder", + "fa_icon": "fas fa-forward" + }, "skip_annotation": { "type": "boolean", "fa_icon": "fas fa-forward", @@ -206,7 +235,8 @@ }, "skip_multiqc": { "type": "boolean", - "description": "Skip MultiQC" + "description": "Skip MultiQC", + "fa_icon": "fas fa-forward" } } }, diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf new file mode 100644 index 00000000..ef777ebc --- /dev/null +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -0,0 +1,83 @@ +// +// Kmerfinder subworkflow for species identification & QC +// +include { UNTAR } from '../../modules/nf-core/untar/main' +include { KMERFINDER } from '../../modules/local/kmerfinder' +include { KMERFINDER_SUMMARY } from '../../modules/local/kmerfinder_summary' +include { FIND_DOWNLOAD_REFERENCE } from '../../modules/local/find_download_reference' + +workflow KMERFINDER_SUBWORKFLOW { + take: + reads // channel: [ meta, reads ] + consensus // channel: [ meta, consensus ] + + main: + ch_versions = Channel.empty() + + // Prepare kmerfinder database + ch_kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) + ch_ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true) + + if ( ch_kmerfinderdb.name.endsWith('.gz') ) { + UNTAR ( [[ id: ch_kmerfinderdb.getSimpleName() ], ch_kmerfinderdb] ) + ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file } + ch_versions = ch_versions.mix(UNTAR.out.versions) + } else { + ch_kmerfinderdb_untar = Channel.from(params.kmerfinderdb) + } + + // MODULE: Kmerfinder, QC for sample purity. Identifies reference specie and reference genome assembly for each sample. + reads + .combine(ch_kmerfinderdb_untar) + .map{ meta, reads, db -> tuple(meta, reads, db) } + .set{ ch_to_kmerfinder } + + KMERFINDER ( + ch_to_kmerfinder + ) + ch_kmerfinder_report = KMERFINDER.out.report + ch_kmerfinder_json = KMERFINDER.out.json + ch_versions = ch_versions.mix(KMERFINDER.out.versions) + + // MODULE: Kmerfinder summary report. Generates a csv report file collecting all sample references. + KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{ meta, report -> report }.collect() + ) + ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml + ch_versions = ch_versions.mix(KMERFINDER_SUMMARY.out.versions) + + // SUBWORKFLOW: Create a channel to organize assemblies and reports based on the identified Kmerfinder reference. + ch_kmerfinder_json + .join(ch_kmerfinder_report, by:0) + .join(consensus, by:0) + .map{ + meta, report_json, report_txt, fasta -> + specie = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Species"] + return tuple(specie, meta, report_txt, fasta) + } + .groupTuple(by:0) // Group by the "Species" field + .set { ch_reports_byreference } + + // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly. + FIND_DOWNLOAD_REFERENCE ( + ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, + ch_ncbi_assembly_metadata + ) + ch_versions = ch_versions.mix(FIND_DOWNLOAD_REFERENCE.out.versions) + + // Organize sample assemblies into channels based on their corresponding reference files. + ch_reports_byreference + .join(FIND_DOWNLOAD_REFERENCE.out.fna) + .join(FIND_DOWNLOAD_REFERENCE.out.gff) + .join(FIND_DOWNLOAD_REFERENCE.out.winner) + .map { + specie, meta, report_txt, fasta, fna, gff, winner_id -> + return tuple([id: winner_id.getBaseName()], meta, fasta, fna, gff) + } + .set { ch_consensus_byrefseq } + + emit: + versions = ch_versions // channel: [ path(versions.yml) ] + summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yml) ] + consensus_byrefseq = ch_consensus_byrefseq // channel: [ refmeta, meta, fasta, fna, gff ] +} diff --git a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf index 05cae8bc..b707cbcb 100644 --- a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf @@ -75,7 +75,7 @@ workflow PIPELINE_INITIALISATION { // // Custom validation for pipeline parameters // - //validateInputParameters() + validateInputParameters() // // Create channel from input file provided through params.input @@ -156,6 +156,26 @@ workflow PIPELINE_COMPLETION { // def validateInputParameters() { // Add functions here for parameters validation + // Check Kraken2 dependencies + if (!params.skip_kraken2 && !params.kraken2db) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Kraken2 database not provided.\n" + + " Please specify the '--kraken2db' parameter to provide the necessary database.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } + + // Check kmerfinder dependencies + if (!params.skip_kmerfinder) { + if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Kmerfinder database and NCBI assembly metadata not provided.\n" + + " Please specify the '--kmerfinderdb' and '--ncbi_assembly_metadata' parameters.\n" + + " Both are required to run Kmerfinder.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } + } } // @@ -184,7 +204,8 @@ def toolCitationText() { "ProeChop (Wick RR et al. 2017)", "Nanoplot (Wouter De Coster and Rosa Rademakers 2023)", "PycoQC (Adrien Leger & Tommaso Leonardi 2019)", - "Kreken2 (Derrick E. Wood et al. 2019)", + "Kraken2 (Derrick E. Wood et al. 2019)", + "Kmerfinder (Larsen et al. 2014)", "Unicycler (Ryan R Wick et al. 2017)", "Minimap & Miniasm (Heng Li 2016)", "Dragonflye (Robert A Petit III )", @@ -212,6 +233,7 @@ def toolBibliographyText() { "
  • Wouter De Coster, Rosa Rademakers, NanoPack2: population-scale evaluation of long-read sequencing data, Bioinformatics, Volume 39, Issue 5, May 2023, btad311, https://doi.org/10.1093/bioinformatics/btad311
  • ", "
  • Leger et al., (2019). pycoQC, interactive quality control for Oxford Nanopore Sequencing. Journal of Open Source Software, 4(34), 1236, https://doi.org/10.21105/joss.01236
  • ", "
  • Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0
  • ", + "
  • RBenchmarking of Methods for Genomic Taxonomy. Larsen MV, Cosentino S, Lukjancenko O, Saputra D, Rasmussen S, Hasman H, Sicheritz-Pontén T, Aarestrup FM, Ussery DW, Lund O. J Clin Microbiol. 2014 Feb 26.
  • ", "
  • Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595.
  • ", "
  • Heng Li, Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences, Bioinformatics, Volume 32, Issue 14, July 2016, Pages 2103–2110, https://doi.org/10.1093/bioinformatics/btw152
  • ", "
  • Petit III, R. A. dragonflye: assemble bacterial isolate genomes from Nanopore reads (Version 1.1.2). https://github.com/rpetit3/dragonflye
  • ", diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 076e62c2..07b60b0b 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -8,14 +8,14 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check krakendb -if(! params.skip_kraken2){ - if(params.kraken2db){ - kraken2db = file(params.kraken2db) - } else { - exit 1, "Missing Kraken2 DB arg" - } -} +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Place config files here + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -26,12 +26,13 @@ if(! params.skip_kraken2){ // // MODULE: Local to the pipeline // -include { PYCOQC } from '../modules/local/pycoqc/main' -include { UNICYCLER } from '../modules/local/unicycler/main' -include { NANOPOLISH } from '../modules/local/nanopolish/main' -include { MEDAKA } from '../modules/local/medaka/main' -include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation/main' -include { DFAST } from '../modules/local/dfast/main' +include { PYCOQC } from '../modules/local/pycoqc' +include { UNICYCLER } from '../modules/local/unicycler' +include { NANOPOLISH } from '../modules/local/nanopolish' +include { MEDAKA } from '../modules/local/medaka' +include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' +include { DFAST } from '../modules/local/dfast' +include { MULTIQC_CUSTOM } from '../modules/local/multiqc_custom' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -60,14 +61,15 @@ include { SAMTOOLS_INDEX } from '../modules/nf-core/samto include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/kraken2/kraken2/main' include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' +include { QUAST as QUAST_BYREFSEQID } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // include { FASTQ_TRIM_FASTP_FASTQC } from '../subworkflows/nf-core/fastq_trim_fastp_fastqc/main' +include { KMERFINDER_SUBWORKFLOW } from '../subworkflows/local/kmerfinder_subworkflow' include { BAKTA_DBDOWNLOAD_RUN } from '../subworkflows/local/bakta_dbdownload_run' include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -237,6 +239,7 @@ workflow BACASS { // // MODULE: Miniasm, genome assembly, long reads + // if ( params.assembler == 'miniasm' ) { MINIMAP2_ALIGN ( ch_for_assembly.map{ meta,sr,lr -> tuple(meta,lr) }, @@ -315,7 +318,7 @@ workflow BACASS { ch_for_polish // tuple val(meta), val(reads), file(longreads), file(assembly) .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) - .join( ch_fast5 ) // tuple val(meta), file(fast5) + .join( ch_fast5 ) // tuple val(meta), file(fast5) .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. @@ -345,7 +348,7 @@ workflow BACASS { ch_kraken_long_multiqc = Channel.empty() if ( !params.skip_kraken2 ) { KRAKEN2_DB_PREPARATION ( - kraken2db + params.kraken2db ) ch_versions = ch_versions.mix(KRAKEN2_DB_PREPARATION.out.versions) KRAKEN2 ( @@ -374,21 +377,68 @@ workflow BACASS { ch_versions = ch_versions.mix(KRAKEN2_LONG.out.versions) } + // + // SUBWORKFLOW: Kmerfinder, QC for sample purity. + // + // Executes both kmerfinder and classifies samples by their reference genome (all this through the kmerfinder_subworkflow()). + + ch_kmerfinder_multiqc = Channel.empty() + if (!params.skip_kmerfinder) { + // Set kmerfinder channel based on assembly type + if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) { + ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads + } else if ( params.assembly_type == 'long' ) { + ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads + } + // RUN kmerfinder subworkflow + KMERFINDER_SUBWORKFLOW ( + ch_for_kmerfinder, + ch_assembly + ) + ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml + ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq + ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions) + + // Set channel to perform by refseq QUAST based on reference genome identified with KMERFINDER. + ch_consensus_byrefseq + .map { + refmeta, meta, consensus, ref_fna, ref_gff -> + return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff) + } + .set { ch_to_quast_byrefseq } + } + // // MODULE: QUAST, assembly QC // ch_assembly - .collect{ it[1] } - .map { consensus_collect -> tuple([id: "report"], consensus_collect) } - .set { ch_to_quast } - - QUAST ( - ch_to_quast, - [[:],[]], - [[:],[]] - ) - ch_quast_multiqc = QUAST.out.tsv - ch_versions = ch_versions.mix(QUAST.out.versions) + .collect{it[1]} + .map{ consensus -> tuple([id:'report'], consensus) } + .set{ ch_to_quast } + + if(params.skip_kmerfinder){ + QUAST( + ch_to_quast, + params.reference_fasta ?: [[:],[]], + params.reference_gff ?: [[:],[]] + ) + ch_quast_multiqc = QUAST.out.results + } else if (!params.skip_kmerfinder) { + // Quast runs twice if kmerfinder is allowed. + // This approach allow Quast to calculate relevant parameters such as genome fraction based on a reference genome. + QUAST( + ch_to_quast, + [[:],[]], + [[:],[]] + ) + QUAST_BYREFSEQID( + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, consensus)}, + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_fasta)}, + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_gff)} + ) + ch_quast_multiqc = QUAST_BYREFSEQID.out.results + } + ch_versions = ch_versions.mix(QUAST.out.versions) // Check assemblies that require further processing for gene annotation ch_assembly @@ -461,40 +511,37 @@ workflow BACASS { // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_config = !params.skip_kmerfinder && params.assembly_type ? Channel.fromPath("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) : Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty() summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - - ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description)) - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: false)) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_trim_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_trim_json_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_quast_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_prokka_txt_multiqc.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bakta_txt_multiqc.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - ch_multiqc_logo.collect().ifEmpty([]) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? Channel.fromPath(params.multiqc_methods_description, checkIfExists: true) : Channel.fromPath("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + MULTIQC_CUSTOM ( + ch_multiqc_config.ifEmpty([]), + ch_multiqc_custom_config.ifEmpty([]), + ch_multiqc_logo.ifEmpty([]), + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), + ch_multiqc_custom_methods_description.ifEmpty([]), + ch_collated_versions.ifEmpty([]), + ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), + ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), + ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([]), + ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), + ch_prokka_txt_multiqc.collect().ifEmpty([]), + ch_bakta_txt_multiqc.collect().ifEmpty([]), + ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) - multiqc_report = MULTIQC.out.report.toList() + multiqc_report = MULTIQC_CUSTOM.out.report.toList() emit: - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + multiqc_report = MULTIQC_CUSTOM.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } /*