From ebcd833a4ae29cc9441d433130e7ee1bc8bc9ffe Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 6 Nov 2023 16:55:08 +0100 Subject: [PATCH 01/58] add kmerfinder for shortreads --- conf/modules.config | 10 ++++++++++ modules/local/kmerfinder.nf | 37 +++++++++++++++++++++++++++++++++++++ nextflow.config | 2 ++ workflows/bacass.nf | 23 +++++++++++++++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 modules/local/kmerfinder.nf diff --git a/conf/modules.config b/conf/modules.config index 8c3f11cf..28794331 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -159,6 +159,16 @@ process { ] } + withName: 'KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*_results.txt", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf new file mode 100644 index 00000000..db7fddf1 --- /dev/null +++ b/modules/local/kmerfinder.nf @@ -0,0 +1,37 @@ +process KMERFINDER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmerfinder=3.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmerfinder:3.0.2--hdfd78af_0' : + 'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + path(kmerfinderDB) + + output: + tuple val(meta), path("*_results.txt") , emit: report + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads.size() == 1 ? "${reads}" : "${reads[0]} ${reads[1]}" + + """ + kmerfinder.py \\ + --infile $in_reads \\ + --output_folder . 
\\ + --db_path ${kmerfinderDB}/bacteria.ATG \\ + -tax ${kmerfinderDB}/bacteria.name \\ + -x + + mv results.txt ${prefix}_results.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmerfinder: \$(echo "3.0.2") + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index fdf6d8c4..49106820 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,7 @@ params { // Contamination_screening kraken2db = "" + kmerfinderdb = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] @@ -42,6 +43,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false + skip_kmerfinder = false skip_pycoqc = false skip_annotation = false skip_polish = false diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 7a2ce5cc..bb1ee142 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -59,6 +59,7 @@ include { UNICYCLER } from '../modules/local/unicycler' include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' +include { KMERFINDER } from '../modules/local/kmerfinder' include { DFAST } from '../modules/local/dfast' // @@ -219,7 +220,7 @@ workflow BACASS { .dump(tag: 'ch_for_assembly') .set { ch_for_assembly } } - +/* // // ASSEMBLY: Unicycler, Canu, Miniasm, Dragonflye // @@ -352,7 +353,7 @@ workflow BACASS { MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) ch_versions = ch_versions.mix(MEDAKA.out.versions.ifEmpty(null)) } - +*/ // // MODULE: Kraken2, QC for sample purity // @@ -388,6 +389,23 @@ workflow BACASS { ch_versions = ch_versions.mix(KRAKEN2_LONG.out.versions.ifEmpty(null)) } + // + // MODULE: Kmerfinder, QC for sample purity + // + + // TODO: add check contamination module // CALLIT PARSE_KMERFINDER + // TODO: if not provided, download reference from kmerfinder results --> module FIND_DOWNLOAD_COMMON_REFFERENCE + // TODO: Create 
kmerfinder mode for short and longreads + // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + KMERFINDER ( + ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] + params.kmerfinderdb // path(kmerfinder database) + ) + ch_kmerfinder_report = KMERFINDER.out.report + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + } +/* // // MODULE: QUAST, assembly QC // @@ -501,6 +519,7 @@ workflow BACASS { ) multiqc_report = MULTIQC.out.report.toList() } +*/ } /* From 68a9c5df89f2d9bf6dee9f849ac0bc5b5237dc15 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 6 Nov 2023 18:07:00 +0100 Subject: [PATCH 02/58] add module kmerfinder summary report --- bin/kmerfinder_summary.py | 204 ++++++++++++++++++++++++++++ conf/modules.config | 10 ++ modules/local/kmerfinder_summary.nf | 26 ++++ workflows/bacass.nf | 6 + 4 files changed, 246 insertions(+) create mode 100755 bin/kmerfinder_summary.py create mode 100644 modules/local/kmerfinder_summary.nf diff --git a/bin/kmerfinder_summary.py b/bin/kmerfinder_summary.py new file mode 100755 index 00000000..612a2525 --- /dev/null +++ b/bin/kmerfinder_summary.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 + + +import argparse +import sys +import re +import csv +import pickle +import os + + +################# +### FUNCTIONS ### +################# + + +def check_arg(args=None): + + """ + Description: + Function collect arguments from command line using argparse + Input: + args # command line arguments + Constant: + None + Variables + parser + Return + parser.parse_args() # Parsed arguments + """ + + parser = argparse.ArgumentParser( + prog="07-kmerfinder.py", + formatter_class=argparse.RawDescriptionHelpFormatter, + description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME + ) + + parser.add_argument( + "--path", + "-p", + required=True, + help="Insert path of results.txt file like 
/home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME + ) + + parser.add_argument( + "--output_bn", "-b", required=True, help="The output in binary file" + ) + + parser.add_argument( + "--output_csv", "-c", required=True, help="The output in csv file" + ) + + # Example: python3 parse_kmerfinder.py -p /home/s.gonzalez/07-kmerfinder -b p_dic.dicke -c p_kmer.csv + + return parser.parse_args() + + +################# +### FUNCTIONS ### +################# + + +def kmerfinder_dictionary(file_txt): + + """ + Description: + Function to extract the relevant part of result.txt file + Input: + result.txt file + Return: + dictionary + """ + + step = "07-kmerfinder_" # FIXME + + num_lines = sum(1 for line in open(file_txt)) + hits = num_lines - 1 # to count the total number of hits + lookupfile = open(file_txt, "r") + lines = lookupfile.readlines() + parameters = lines[0].strip().split("\t") + if num_lines > 1: + values_best_hit = lines[1].strip().split("\t") + if num_lines > 2: + values_second_hit = lines[2].strip().split("\t") + + kmer_dict = {} + + for i in range(len(parameters)): + if num_lines > 1: + kmer_dict[step + "best_hit_" + parameters[i]] = values_best_hit[i] + else: + kmer_dict[step + "best_hit_" + parameters[i]] = "" + + kmer_dict.update(Total_hits_07_kmerfinder=hits) + + if num_lines > 2: + + kmer_dict[step + "second_hit_" + parameters[i]] = values_second_hit[i] + + else: + + kmer_dict[step + "second_hit_" + parameters[i]] = "" + + return kmer_dict + + +################# +### FUNCTIONS ### +################# + + +def dictionary2bn(dictionary, binary_file): + + """ + + Description: + Function to create a binary file from a dictionary + Input: + dictionary + Return: + binary file + """ + + pickle_out = open(binary_file, "wb") + pickle.dump(dictionary, pickle_out) + pickle_out.close() + + return + + +################# +### FUNCTIONS ### +################# + + +def dictionary2csv(dictionary, csv_file): + + """ + + Description: + Function to create a csv from a 
dictionary + Input: + dictionary + Return: + csv file + + """ + + header = sorted(set(i for b in map(dict.keys, dictionary.values()) for i in b)) + with open(csv_file, "w", newline="") as f: + write = csv.writer(f) + write.writerow(["sample_name", *header]) + for a, b in dictionary.items(): + write.writerow([a] + [b.get(i, "") for i in header]) + return + + +################### +### MAIN SCRIPT ### +################### + + +if __name__ == "__main__": + + # Variables + version = "07-kmerfinder.py v 0.1.0." # Script version # FIXME + arguments = check_arg(sys.argv[1:]) + + # Create sample_id_list + path = arguments.path + sample_list = [] + tmp = os.listdir(path) + for item in tmp: + if os.path.isdir(os.path.join(path, item)): + if item != "logs": + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + else: + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + + print("sample_list done") + + # Create a dictionary + kmer_all = {} + + for sample in sample_list: + file_name = os.path.join(path, sample + "_results.txt" ) + kmer_all[sample] = kmerfinder_dictionary(file_name) + + print("kmerfinder_dictionary done") + # print (kmer_all) + + # Save the dicctionary to binary file + + dictionary2bn(kmer_all, arguments.output_bn) + + print("kmerfinder_dictionary_bn done") + + # Convert the dictionary to csv file + + dictionary2csv(kmer_all, arguments.output_csv) + + print("kmerfinder_dictionary_csv done") diff --git a/conf/modules.config b/conf/modules.config index 28794331..a32d379d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -169,6 +169,16 @@ process { ] } + withName: 'KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf new file mode 100644 index 00000000..af188125 --- /dev/null +++ b/modules/local/kmerfinder_summary.nf @@ -0,0 +1,26 @@ +process KMERFINDER_SUMMARY { + tag "kmerfinder_summary" + label 'process_low' + + conda "bioconda::python=3.10.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.10' : + 'biocontainers/python:3.10' }" + + input: + path(reports, stageAs: 'reports/*') + + output: + path "kmerfinder.csv" , emit: summary + path "versions.yml" , emit: versions + + script: + """ + kmerfinder_summary.py --path kmerfinder_reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/workflows/bacass.nf b/workflows/bacass.nf index bb1ee142..8ac32439 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -60,6 +60,7 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { KMERFINDER } from '../modules/local/kmerfinder' +include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' include { DFAST } from '../modules/local/dfast' // @@ -404,6 +405,11 @@ workflow BACASS { ) ch_kmerfinder_report = KMERFINDER.out.report ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{meta, report -> report }.collect() + ) + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) } /* // From fd8a934cdf1b6c08d93157fd15bbc04151749747 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 14 Nov 2023 13:48:33 +0100 
Subject: [PATCH 03/58] add module to find and download reference genome --- bin/download_reference.py | 153 +++++++++++++++++++++++ bin/find_common_reference.py | 104 +++++++++++++++ modules/local/find_download_reference.nf | 37 ++++++ nextflow.config | 3 + nextflow_schema.json | 25 +++- workflows/bacass.nf | 22 +++- 6 files changed, 337 insertions(+), 7 deletions(-) create mode 100755 bin/download_reference.py create mode 100755 bin/find_common_reference.py create mode 100644 modules/local/find_download_reference.nf diff --git a/bin/download_reference.py b/bin/download_reference.py new file mode 100755 index 00000000..907c547a --- /dev/null +++ b/bin/download_reference.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +EDITED: 14-11-2023 +DESCRIPTION: + Given a file with the kmerfinder results and frequencies (probably + created by find_common_reference.py), and the NCBI assembly sheet, + download the top-reference genome, gff and protein files from + the NCBI ftp. + +INPUT: + -FILE: file containing the ranking of references from kmerfinder created by the script find_common_references + -REFERENCE: file with the NCBI reference list + -OUTDIR: name of the output dir + +OUTPUT: + - *_fna.gz: file with the top-reference genome + - *_gff.gz: file with the top-reference gff + - *_protein.gz: file with the top-reference proteins + +USAGE: + python download_reference.py + -file [FILE] + -reference [REFERENCE] + -out_dir [OUTDIR] + +REQUIREMENTS: + -Python >= 3.6 + -Python wget + +DISCLAIMER: + This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. 
+================================================================ +END_OF_HEADER +================================================================ +""" + +import sys +import argparse +import os + +#import wget +import requests + + +def parse_args(args=None): + Description = ( + "download the reference files \ + (fna, faa, gff)from the reference NCBI file." + ) + Epilog = """Usage example: \ + python download_reference.py \ + -file \ + -reference \ + -out_dir """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-file", + help="File containing the ranking of references from kmerfinder." + ) + parser.add_argument( + "-reference", + help="File containing the paths to bacterial references." + ) + parser.add_argument( + "-out_dir", + help="Output directory." + ) + + return parser.parse_args(args) + + +def download_references(file, reference, out_dir): + """ + Downloads the top reference from the NCBI database + """ + + reference_ends = ["_genomic.fna.gz", "_protein.faa.gz", "_genomic.gff.gz"] + + # extract the most common reference from file + with open(file) as infile: + infile = infile.readlines() + infile = [ + item.replace("\n", "").split("\t") + for item in infile + if not item.startswith("#") + ] + top_reference = infile[0][0] + + print(top_reference) + + # create the outdir (do nothing if already there) + try: + os.mkdir(out_dir) + except FileExistsError: + pass + + # open the reference and find the reference + with open(reference) as inref: + inref = inref.readlines() + inref = [ + item.replace("\n", "").split("\t") + for item in inref + if not item.startswith("#") + ] + + url = [row[19] for row in inref if row[0] in top_reference] + + if len(url) == 0: + print("No assemblies responding to the top reference: ", top_reference, " were found") + sys.exit(1) + + + url = str(url[0]) + url_https = url.replace('ftp', 'https') + + # get url and reference file + + for r_end in reference_ends: + + out_file = out_dir + 
"/" + top_reference + r_end + file_url = url_https + "/" + top_reference + r_end + + print(out_file) + print(file_url) + + #wget.download(file_url, out_file) + response = requests.get(file_url, stream=True) + with open(out_file, 'wb') as out: + for chunk in response.iter_content(chunk_size=8192): + out.write(chunk) + + return + + +def main(args=None): + args = parse_args(args) + download_references(args.file, args.reference, args.out_dir) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/find_common_reference.py b/bin/find_common_reference.py new file mode 100755 index 00000000..e26aaf53 --- /dev/null +++ b/bin/find_common_reference.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +DESCRIPTION: + Given a directory with kmerfinder results, sum them up + in an outfile named by the user. + +INPUT: + -DIRECTORY: directory containing all kmerfinder results. + -OUTFILE: Name of the file to write the whole results in. + +OUTPUT: + -OUTFILE: file containing the kmerfinder results. + +USAGE: + python find_common_reference.py -d [DIRECTORY] -o [OUTFILE] +REQUIREMENTS: + -Python >= 3.6 + +DISCLAIMER: This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. + +================================================================ +END_OF_HEADER +================================================================ +""" +import os +import sys +import errno +import argparse + + +def parse_args(args=None): + """ + Parse the args given to argparser + """ + Description = "Fetch kmerfinder result files and get the most used reference." 
+ Epilog = """Example usage: python find_common_reference.py -d -o """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("-d", help="Input directory.") + parser.add_argument("-o", help="Output file.") + return parser.parse_args(args) + + +def group_references(kmer_result_dir, out_file): + """ + Unifies the kmerfinder results, and counts their occurrences + """ + reference_assembly = {} + + # for file in dir + for k_file in os.listdir(kmer_result_dir): + # open file + with open(os.path.join(kmer_result_dir, k_file), "r") as fh: + file_lines = fh.readlines() + + # remove heading + try: + heading = file_lines[0].split("\t") + first_line = file_lines[1].split("\t") + + # where is the assembly in the header? + # find reference according to index + index_assembly = heading.index("# Assembly") + reference = first_line[index_assembly] + + # add it to the dict if not there + if reference not in reference_assembly: + index_description = heading.index("Description") + reference_assembly[reference] = [0, first_line[index_description]] + # sum 1 for another occurrence + reference_assembly[reference][0] += 1 + except IndexError: + pass + + # sort it (more occurrences first in file) + order_reference = dict( + sorted(reference_assembly.items(), key=lambda x: x[1][0], reverse=True) + ) + + # write it + with open(out_file, "w") as f_out: + for key, value in order_reference.items(): + f_out.write(key + "\t" + str(value[0]) + "\t" + value[1] + "\n") + return + + +def main(args=None): + args = parse_args(args) + group_references(args.d, args.o) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf new file mode 100644 index 00000000..e847e4f2 --- /dev/null +++ b/modules/local/find_download_reference.nf @@ -0,0 +1,37 @@ +process FIND_DOWNLOAD_REFERENCE { + tag "${task.process}" + label 'process_low' + + conda "conda-forge::requests=2.26.0" + 
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0' : + 'biocontainers/requests:2.26.0' }" + + input: + path(reports, stageAs: 'reports/*') + path(ncbi_reference) + + output: + path "references_found.tsv" , emit: target_references_tsv + path "*.fna.gz" , emit: fna + path "*.gff.gz" , emit: gff + path "*.faa.gz" , emit: faa + path "versions.yml" , emit: versions + + script: + """ + find_common_reference.py \\ + -d reports/ \\ + -o references_found.tsv + + download_reference.py \\ + -file references_found.tsv \\ + -reference $ncbi_reference \\ + -out_dir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 49106820..4dc16396 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,6 +19,9 @@ params { // Contamination_screening kraken2db = "" kmerfinderdb = "" + reference_fasta = "" + reference_gff = "" + reference_ncbi_bacteria = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] diff --git a/nextflow_schema.json b/nextflow_schema.json index 2eb706e0..0f0e9185 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,6 +73,23 @@ "fa_icon": "fab fa-gitkraken", "help_text": "See [Kraken2 homepage](https://benlangmead.github.io/aws-indexes/k2) for download\nlinks. Minikraken2 8GB is a reasonable choice, since we run Kraken here mainly just to check for\nsample purity.", "description": "Path to Kraken2 database." + }, + "kmerfinderdb": { + "type": "string", + "description": "Database for Kmerfinder.", + "help_text": "" + }, + "reference_fasta": { + "type": "string", + "description": "Reference FASTA file." + }, + "reference_gff": { + "type": "string", + "description": "Reference GFF file." 
+ }, + "reference_ncbi_bacteria": { + "type": "string", + "description": "NCBI Bacteria reference database" } } }, @@ -188,6 +205,11 @@ "fa_icon": "fas fa-forward", "description": "Skip running Kraken2 classifier on reads." }, + "skip_kmerfinder": { + "type": "boolean", + "description": "Skip contamination analysis with Kmerfinder", + "fa_icon": "fas fa-forward" + }, "skip_annotation": { "type": "boolean", "fa_icon": "fas fa-forward", @@ -205,7 +227,8 @@ }, "skip_multiqc": { "type": "boolean", - "description": "Skip MultiQC" + "description": "Skip MultiQC", + "fa_icon": "fas fa-forward" } } }, diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 8ac32439..cae1ad2b 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -61,6 +61,7 @@ include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { KMERFINDER } from '../modules/local/kmerfinder' include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' +include { FIND_DOWNLOAD_REFERENCE } from '../modules/local/find_download_reference' include { DFAST } from '../modules/local/dfast' // @@ -393,23 +394,32 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // - - // TODO: add check contamination module // CALLIT PARSE_KMERFINDER - // TODO: if not provided, download reference from kmerfinder results --> module FIND_DOWNLOAD_COMMON_REFFERENCE // TODO: Create kmerfinder mode for short and longreads // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it + // TODO: create a strategy to group the samples according to the reference found. 
+ // TODO: I think that this kmerfinder step could be grouped into a subworkflow if ( !params.skip_kmerfinder && params.kmerfinderdb ) { KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] params.kmerfinderdb // path(kmerfinder database) ) - ch_kmerfinder_report = KMERFINDER.out.report - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + KMERFINDER.out.report + .map { meta, report -> report } + .collect() + .set { ch_kmerfinder_reports } + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER_SUMMARY ( - ch_kmerfinder_report.map{meta, report -> report }.collect() + ch_kmerfinder_reports ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_kmerfinder_reports, + params.reference_ncbi_bacteria + ) + } } /* // From c81cc2963eb4596caa1bfc257874e7895c849e6e Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 14 Nov 2023 17:42:27 +0100 Subject: [PATCH 04/58] fix kmerfinder summary input --- modules/local/kmerfinder_summary.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index af188125..b60fe4ca 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -16,7 +16,7 @@ process KMERFINDER_SUMMARY { script: """ - kmerfinder_summary.py --path kmerfinder_reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From f989ae3222e64e0938602f8b149ecae0bcde4fc3 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 12:24:35 +0100 Subject: [PATCH 05/58] update kmerfinder output file extension --- conf/modules.config | 2 +- modules/local/kmerfinder.nf | 4 +++- 2 files changed, 4 insertions(+), 2 
deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a32d379d..ebbaf20c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -164,7 +164,7 @@ process { publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*_results.txt", + pattern: "*.txt", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index db7fddf1..f7dd707e 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -12,7 +12,8 @@ process KMERFINDER { path(kmerfinderDB) output: - tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json path "versions.yml" , emit: versions script: @@ -28,6 +29,7 @@ process KMERFINDER { -x mv results.txt ${prefix}_results.txt + mv data.json ${prefix}_data.json cat <<-END_VERSIONS > versions.yml "${task.process}": From 20be5f9fc86b138e346bee51ad6fb45e00401a6a Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 12:25:07 +0100 Subject: [PATCH 06/58] add kmerfinder refseqid to meta --- workflows/bacass.nf | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index cae1ad2b..fb31f634 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -92,6 +92,7 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/krake include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' +include { GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { MULTIQC } from 
'../modules/nf-core/multiqc/main' @@ -394,20 +395,31 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // - // TODO: Create kmerfinder mode for short and longreads - // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it - // TODO: create a strategy to group the samples according to the reference found. + // TODO: Create kmerfinder mode for longreads + // TODO: create a strategy to group the samples according to the reference found. [pending, fix splitjson path-key] // TODO: I think that this kmerfinder step could be grouped into a subworkflow - if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + // TODO: Create a by refseq-id quast report && general. + // TODO: hack multiqc to group quast-entries by refseqid? KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] params.kmerfinderdb // path(kmerfinder database) ) + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + KMERFINDER.out.json + .join(ch_for_assembly, by:0) + .map{ + meta, json, sr, lr -> + meta.refseq = json + .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + return tuple(meta, sr, lr) + } + .set { ch_refseqid } + KMERFINDER.out.report .map { meta, report -> report } .collect() .set { ch_kmerfinder_reports } - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER_SUMMARY ( ch_kmerfinder_reports From 16b5068fe768569e04d1905812494b23a17a0c4f Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 14:12:54 +0100 Subject: [PATCH 07/58] fix url in kmerfinder donwload ref --- bin/download_reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/download_reference.py b/bin/download_reference.py index 907c547a..8fa18da4 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -123,7 +123,7 @@ def download_references(file, reference, out_dir): url = str(url[0]) - url_https = url.replace('ftp', 'https') + 
url_https = url.replace('ftp', 'https', 1) # get url and reference file From 10f68ba24eedd27d647d3500d5c559b2067d70c0 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 16:44:36 +0100 Subject: [PATCH 08/58] temporary commit --- modules/local/kmerfinder.nf | 4 +- modules/local/kmerfinder_summary.nf | 2 +- workflows/bacass.nf | 58 ++++++++++++++++++----------- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index f7dd707e..58a6de83 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -12,8 +12,8 @@ process KMERFINDER { path(kmerfinderDB) output: - tuple val(meta), path("*_results.txt") , emit: report - tuple val(meta), path("*_data.json") , emit: json + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json path "versions.yml" , emit: versions script: diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index b60fe4ca..bb8f11d4 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -8,7 +8,7 @@ process KMERFINDER_SUMMARY { 'biocontainers/python:3.10' }" input: - path(reports, stageAs: 'reports/*') + val(meta), path(report, stageAs: 'reports/*') output: path "kmerfinder.csv" , emit: summary diff --git a/workflows/bacass.nf b/workflows/bacass.nf index fb31f634..3a281f84 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -92,7 +92,7 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/krake include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' -include { GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { 
CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -223,7 +223,7 @@ workflow BACASS { .dump(tag: 'ch_for_assembly') .set { ch_for_assembly } } -/* + // // ASSEMBLY: Unicycler, Canu, Miniasm, Dragonflye // @@ -356,7 +356,7 @@ workflow BACASS { MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) ch_versions = ch_versions.mix(MEDAKA.out.versions.ifEmpty(null)) } -*/ + // // MODULE: Kraken2, QC for sample purity // @@ -400,38 +400,54 @@ workflow BACASS { // TODO: I think that this kmerfinder step could be grouped into a subworkflow // TODO: Create a by refseq-id quast report && general. // TODO: hack multiqc to group quast-entries by refseqid? + // TODO: corner casse >1 refseq_id + // TODO: PREPARE REFERENCES SUBWORKFLOW + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + if( params.kmerfinderdb.endsWith('.gz') ){ + GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) + ch_kmerfinderdb = GUNZIP_KMERFINDERDB.out.gunzip + } else { + ch_kmerfinderdb = params.kmerfinderdb + } + KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] - params.kmerfinderdb // path(kmerfinder database) + ch_kmerfinderdb ) ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER.out.json - .join(ch_for_assembly, by:0) + .join(KMERFINDER.out.report, by:0) + .join(ch_assembly, by:0) .map{ - meta, json, sr, lr -> - meta.refseq = json + meta, json, report, fasta -> + def new_meta = [:] + new_meta.refseq = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(meta, sr, lr) + return tuple(meta, new_meta, report, fasta) } - .set { ch_refseqid } + .groupTuple(by:1) + .set { ch_refseqid_fasta } - KMERFINDER.out.report - .map { meta, report -> report } - .collect() - .set { ch_kmerfinder_reports } + ch_refseqid_fasta.map{ meta, new_meta, report, fasta -> tuple (meta, report)}.view() + + +// KMERFINDER.out.report 
+// .map { meta, report -> report } +// .collect() +// .set { ch_kmerfinder_reports } KMERFINDER_SUMMARY ( - ch_kmerfinder_reports + ch_refseqid_fasta.map{ meta, report, fasta -> tuple (meta, report)} ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - if (!params.reference_fasta && !params.reference_gff) { - FIND_DOWNLOAD_REFERENCE ( - ch_kmerfinder_reports, - params.reference_ncbi_bacteria - ) - } +// if (!params.reference_fasta && !params.reference_gff) { +// FIND_DOWNLOAD_REFERENCE ( +// ch_kmerfinder_reports, +// params.reference_ncbi_bacteria +// ) +// } } /* // @@ -449,7 +465,7 @@ workflow BACASS { ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) - +/* // Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> From a48518ddcdc12d6a41b878165b5620269a3c29bc Mon Sep 17 00:00:00 2001 From: Dani VM Date: Sun, 19 Nov 2023 10:54:32 +0100 Subject: [PATCH 09/58] group assemblies by refseqid --- modules/local/find_download_reference.nf | 3 ++- modules/local/kmerfinder_summary.nf | 8 +++---- workflows/bacass.nf | 28 ++++++++++-------------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index e847e4f2..60cc5c66 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -8,7 +8,7 @@ process FIND_DOWNLOAD_REFERENCE { 'biocontainers/requests:2.26.0' }" input: - path(reports, stageAs: 'reports/*') + tuple val(meta), path(reports, stageAs: 'reports/*') path(ncbi_reference) output: @@ -19,6 +19,7 @@ process FIND_DOWNLOAD_REFERENCE { path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.refseq}" """ find_common_reference.py \\ -d reports/ \\ diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index bb8f11d4..58fe104f 100644 --- 
a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -8,15 +8,15 @@ process KMERFINDER_SUMMARY { 'biocontainers/python:3.10' }" input: - val(meta), path(report, stageAs: 'reports/*') + path(report, stageAs: 'reports/*') output: - path "kmerfinder.csv" , emit: summary - path "versions.yml" , emit: versions + path "*.csv" , emit: summary + path "versions.yml" , emit: versions script: """ - kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 3a281f84..8c86338e 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -424,30 +424,24 @@ workflow BACASS { def new_meta = [:] new_meta.refseq = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(meta, new_meta, report, fasta) + return tuple(new_meta, meta, report, fasta) } - .groupTuple(by:1) + .groupTuple(by:0) .set { ch_refseqid_fasta } - ch_refseqid_fasta.map{ meta, new_meta, report, fasta -> tuple (meta, report)}.view() - - -// KMERFINDER.out.report -// .map { meta, report -> report } -// .collect() -// .set { ch_kmerfinder_reports } - + ch_reports_Byrefseqid = ch_refseqid_fasta + .map{ new_meta, meta, report, fasta -> [new_meta, report] } KMERFINDER_SUMMARY ( - ch_refseqid_fasta.map{ meta, report, fasta -> tuple (meta, report)} + KMERFINDER.out.report.map{meta, report -> report }collect() ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) -// if (!params.reference_fasta && !params.reference_gff) { -// FIND_DOWNLOAD_REFERENCE ( -// ch_kmerfinder_reports, -// params.reference_ncbi_bacteria -// ) -// } + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_reports_Byrefseqid, + params.reference_ncbi_bacteria + ) + } } /* // 
From 1e436ea100a93acad665abdb61cf3c7f03eadcaf Mon Sep 17 00:00:00 2001 From: Dani VM Date: Sun, 19 Nov 2023 22:23:24 +0100 Subject: [PATCH 10/58] allow global quast and by-refseqid quast --- conf/modules.config | 14 +++++++-- modules/local/find_download_reference.nf | 11 +++---- workflows/bacass.nf | 40 ++++++++++++++++-------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ebbaf20c..3a3bcee9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -195,12 +195,22 @@ process { ] } - withName: 'QUAST' { + withName: 'QUAST*' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: "meta.id", + saveAs: { filename -> + if (filename.equals('versions.yml')){ + null + } else if (filename.startsWith('GCF')){ + "bySampleReference/${filename}" + } + else { + "global/${filename}" + } + } ] } diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 60cc5c66..478d3b67 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -12,14 +12,13 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_reference) output: - path "references_found.tsv" , emit: target_references_tsv - path "*.fna.gz" , emit: fna - path "*.gff.gz" , emit: gff - path "*.faa.gz" , emit: faa - path "versions.yml" , emit: versions + tuple val(meta), path( "references_found.tsv") , emit: target_references_tsv + tuple val(meta), path( "*.fna.gz") , emit: fna + tuple val(meta), path( "*.gff.gz") , emit: gff + tuple val(meta), path( "*.faa.gz") , emit: faa + path "versions.yml" , emit: versions script: - def prefix = task.ext.prefix ?: "${meta.refseq}" """ find_common_reference.py \\ -d reports/ \\ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 8c86338e..639d0974 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ 
-91,6 +91,7 @@ include { SAMTOOLS_INDEX } from '../modules/nf-core/samto include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/kraken2/kraken2/main' include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' +include { QUAST as QUAST_BYREFSEQID } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' @@ -395,13 +396,13 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // + // TODO: Create kmerfinder mode for longreads - // TODO: create a strategy to group the samples according to the reference found. [pending, fix splitjson path-key] // TODO: I think that this kmerfinder step could be grouped into a subworkflow - // TODO: Create a by refseq-id quast report && general. // TODO: hack multiqc to group quast-entries by refseqid? 
// TODO: corner casse >1 refseq_id // TODO: PREPARE REFERENCES SUBWORKFLOW + // TODO: PASS QUAST_BYREF TSV TO MULTIQC if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -422,7 +423,7 @@ workflow BACASS { .map{ meta, json, report, fasta -> def new_meta = [:] - new_meta.refseq = json + new_meta.id = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] return tuple(new_meta, meta, report, fasta) } @@ -432,7 +433,7 @@ workflow BACASS { ch_reports_Byrefseqid = ch_refseqid_fasta .map{ new_meta, meta, report, fasta -> [new_meta, report] } KMERFINDER_SUMMARY ( - KMERFINDER.out.report.map{meta, report -> report }collect() + KMERFINDER.out.report.map{meta, report -> report }.collect() ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) @@ -441,25 +442,39 @@ workflow BACASS { ch_reports_Byrefseqid, params.reference_ncbi_bacteria ) + ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna + ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff } } -/* + // // MODULE: QUAST, assembly QC // - ch_assembly - .collect{ it[1] } - .map { consensus_collect -> tuple([id: "report"], consensus_collect) } - .set { ch_to_quast } + ch_refseqid_fasta + .join(ch_reference_fasta) + .join(ch_reference_gff) + .groupTuple(by:0) + .set { ch_to_quast} - QUAST ( - ch_to_quast, + QUAST( + ch_assembly + .collect{ it[1]} + .map{ consensus -> tuple([id:'report'], consensus)}, [[:],[]], [[:],[]] ) + QUAST_BYREFSEQID ( + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten()) + }, + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) -/* + // 
Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> @@ -557,7 +572,6 @@ workflow BACASS { ) multiqc_report = MULTIQC.out.report.toList() } -*/ } /* From 7969e11b03a888b3d0d0face9aecd708dae3e70c Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 21 Nov 2023 10:31:17 +0100 Subject: [PATCH 11/58] move kmerfinder processing to subworkflow plus output refactoring --- conf/modules.config | 13 ++- subworkflows/local/kmerfinder_subworkflow.nf | 65 +++++++++++++++ workflows/bacass.nf | 84 +++++++------------- 3 files changed, 101 insertions(+), 61 deletions(-) create mode 100644 subworkflows/local/kmerfinder_subworkflow.nf diff --git a/conf/modules.config b/conf/modules.config index 3a3bcee9..41dd51f3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -159,7 +159,7 @@ process { ] } - withName: 'KMERFINDER' { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { ext.args = '' publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, @@ -169,7 +169,7 @@ process { ] } - withName: 'KMERFINDER_SUMMARY' { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { ext.args = '' publishDir = [ path: { "${params.outdir}/Kmerfinder" }, @@ -195,20 +195,19 @@ process { ] } - withName: 'QUAST*' { + withName: 'QUAST' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - pattern: "meta.id", saveAs: { filename -> - if (filename.equals('versions.yml')){ + if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null } else if (filename.startsWith('GCF')){ - "bySampleReference/${filename}" + "report_bySampleReference/${filename}" } else { - "global/${filename}" + "${filename}" } } ] diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf new file mode 100644 index 00000000..bc844c77 --- /dev/null +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -0,0 +1,65 @@ +// +// Kmerfinder subworkflow for species 
identification & QC +// +include { KMERFINDER } from '../../modules/local/kmerfinder' +include { KMERFINDER_SUMMARY } from '../../modules/local/kmerfinder_summary' +include { FIND_DOWNLOAD_REFERENCE } from '../../modules/local/find_download_reference' +include { QUAST } from '../../modules/nf-core/quast/main' + +workflow KMERFINDER_SUBWORKFLOW { + take: + kmerfinder_db // channel: [ path ] + ncbi_bacteria_db // channel: [ path ] + reads // channel: [ meta, reads ] + consensus // channel: [ meta, consensus ] + + main: + ch_versions = Channel.empty() + + // MODULE: Kmerfinder, QC for sample purity + KMERFINDER ( + reads, + kmerfinder_db + ) + ch_kmerfinder_report = KMERFINDER.out.report + ch_kmerfinder_json = KMERFINDER.out.json + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + // MODULE: Kmerfinder summary report + KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{meta, report -> report }.collect() + ) + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + + // SUBWORKFLOW: Group sample assemblies by reference geneome + ch_kmerfinder_json + .join(ch_kmerfinder_report, by:0) + .join(consensus, by:0) + .map{ + meta, json, report_txt, fasta -> + def refseq = [:] + refseq.id = json + .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + return tuple(refseq, meta, report_txt, fasta) + } + .groupTuple(by:0) + .set { ch_consensus_byrefseq } + + // MODULE: Find & Download common reference sequences + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_consensus_byrefseq.map{ refseq, meta, report_txt, fasta -> tuple(refseq, report_txt)}, + ncbi_bacteria_db + ) + ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna + ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff + ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) + } + + + emit: + versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] + reference_fasta = 
ch_reference_fasta // channel: [ meta, path(*.fna) ] + reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] + consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] +} diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 639d0974..9ba3f49a 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -59,9 +59,6 @@ include { UNICYCLER } from '../modules/local/unicycler' include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' -include { KMERFINDER } from '../modules/local/kmerfinder' -include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' -include { FIND_DOWNLOAD_REFERENCE } from '../modules/local/find_download_reference' include { DFAST } from '../modules/local/dfast' // @@ -102,6 +99,7 @@ include { MULTIQC } from '../modules/nf-core/multi // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // include { FASTQ_TRIM_FASTP_FASTQC } from '../subworkflows/nf-core/fastq_trim_fastp_fastqc/main' +include { KMERFINDER_SUBWORKFLOW } from '../subworkflows/local/kmerfinder_subworkflow' include { BAKTA_DBDOWNLOAD_RUN } from '../subworkflows/local/bakta_dbdownload_run' /* @@ -398,7 +396,6 @@ workflow BACASS { // // TODO: Create kmerfinder mode for longreads - // TODO: I think that this kmerfinder step could be grouped into a subworkflow // TODO: hack multiqc to group quast-entries by refseqid? 
// TODO: corner casse >1 refseq_id // TODO: PREPARE REFERENCES SUBWORKFLOW @@ -411,70 +408,49 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } - KMERFINDER ( - ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] - ch_kmerfinderdb + KMERFINDER_SUBWORKFLOW ( + ch_kmerfinderdb, + params.reference_ncbi_bacteria, + ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] + ch_assembly // [meta, consensus] ) - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) - - KMERFINDER.out.json - .join(KMERFINDER.out.report, by:0) - .join(ch_assembly, by:0) - .map{ - meta, json, report, fasta -> - def new_meta = [:] - new_meta.id = json - .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(new_meta, meta, report, fasta) - } - .groupTuple(by:0) - .set { ch_refseqid_fasta } - - ch_reports_Byrefseqid = ch_refseqid_fasta - .map{ new_meta, meta, report, fasta -> [new_meta, report] } - KMERFINDER_SUMMARY ( - KMERFINDER.out.report.map{meta, report -> report }.collect() - ) - ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta + ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff + ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq + ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - if (!params.reference_fasta && !params.reference_gff) { - FIND_DOWNLOAD_REFERENCE ( - ch_reports_Byrefseqid, - params.reference_ncbi_bacteria - ) - ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna - ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff - } } // // MODULE: QUAST, assembly QC // - ch_refseqid_fasta - .join(ch_reference_fasta) - .join(ch_reference_gff) - .groupTuple(by:0) - .set { ch_to_quast} - - QUAST( - ch_assembly + ch_assembly .collect{ it[1]} - .map{ consensus -> tuple([id:'report'], consensus)}, + .map{ consensus -> 
tuple([id:'report'], consensus)} + .set{ch_to_quast} + QUAST( + ch_to_quast, [[:],[]], [[:],[]] ) - QUAST_BYREFSEQID ( - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten()) - }, - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} - ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) + if (!params.skip_kmerfinder){ + // Prepare input for quast + ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] + .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .groupTuple(by:0) + .set { ch_to_quast_byrefseq}// channel: [refseq, meta, report, consensus, ref_fasta, ref_gff] + + QUAST_BYREFSEQID ( + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten())}, + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) + } + // Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> From 3e8d9f19c853400b0f7148057953bc77ec1d5cea Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 21 Nov 2023 15:50:05 +0100 Subject: [PATCH 12/58] allow quast to standard and byrefseq data --- subworkflows/local/kmerfinder_subworkflow.nf | 6 ++ workflows/bacass.nf | 58 ++++++++++++-------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index bc844c77..99b9b4c3 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ 
b/subworkflows/local/kmerfinder_subworkflow.nf @@ -56,9 +56,15 @@ workflow KMERFINDER_SUBWORKFLOW { ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) } + // Get reference sequence IDs + ch_consensus_byrefseq + .map{ refseq, meta, report_txt, fasta -> refseq } + .collect() + .set { ch_refseqid } emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] + refseqids = ch_refseqid reference_fasta = ch_reference_fasta // channel: [ meta, path(*.fna) ] reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 9ba3f49a..694c5e9f 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -397,8 +397,6 @@ workflow BACASS { // TODO: Create kmerfinder mode for longreads // TODO: hack multiqc to group quast-entries by refseqid? - // TODO: corner casse >1 refseq_id - // TODO: PREPARE REFERENCES SUBWORKFLOW // TODO: PASS QUAST_BYREF TSV TO MULTIQC if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ @@ -414,13 +412,30 @@ workflow BACASS { ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] ch_assembly // [meta, consensus] ) + ch_refseqid = KMERFINDER_SUBWORKFLOW.out.refseqids ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) + // Processing output: + ch_consensus_byrefseq + .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .groupTuple(by:0) + .map { + refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> + ch_refseqid.size() + if 
(ch_refseqid.size().getVal() > 1 ){ + return [refseq, consensus.flatten(), ref_fasta, ref_gff] + } else { + return [[id:'report'], consensus.flatten(), ref_fasta, ref_gff] + } + } + .set { ch_to_quast_byrefseq } } + // // MODULE: QUAST, assembly QC // @@ -428,28 +443,27 @@ workflow BACASS { .collect{ it[1]} .map{ consensus -> tuple([id:'report'], consensus)} .set{ch_to_quast} - QUAST( - ch_to_quast, - [[:],[]], - [[:],[]] - ) - ch_quast_multiqc = QUAST.out.tsv - ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) - - if (!params.skip_kmerfinder){ - // Prepare input for quast - ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] - .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] - .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] - .groupTuple(by:0) - .set { ch_to_quast_byrefseq}// channel: [refseq, meta, report, consensus, ref_fasta, ref_gff] - QUAST_BYREFSEQID ( - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten())}, - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} - ) + if(params.skip_kmerfinder){ + QUAST( + ch_to_quast, + params.reference_fasta ?: [[:],[]], + params.reference_gff ?: [[:],[]] + ) + } else if (ch_to_quast_byrefseq){ + QUAST( + ch_to_quast, + [[:],[]], + [[:],[]] + ) + QUAST_BYREFSEQID( + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus)}, + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) } + ch_quast_multiqc = QUAST.out.tsv + ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that 
require further processing for gene annotation ch_assembly From 53cc7ecc5ef5def0c7e55dd50f2a188e376893ec Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 22 Nov 2023 13:55:32 +0100 Subject: [PATCH 13/58] add byrefseq quast reports to multiqc and patch quast --- conf/modules.config | 4 ++-- modules.json | 3 ++- modules/nf-core/quast/main.nf | 4 ++-- modules/nf-core/quast/quast.diff | 23 +++++++++++++++++++++++ workflows/bacass.nf | 8 ++++---- 5 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 modules/nf-core/quast/quast.diff diff --git a/conf/modules.config b/conf/modules.config index 41dd51f3..09098682 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -204,7 +204,7 @@ process { if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null } else if (filename.startsWith('GCF')){ - "report_bySampleReference/${filename}" + "runs_per_reference/${filename}" } else { "${filename}" @@ -233,7 +233,7 @@ process { } withName: 'MULTIQC' { - ext.args = '' + ext.args = params.skip_kmerfinder ? 
'' : '--cl-config "quast: {fn: GCF*, shared: true}"' publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, diff --git a/modules.json b/modules.json index 8a3cf78d..954e22da 100644 --- a/modules.json +++ b/modules.json @@ -86,7 +86,8 @@ "quast": { "branch": "master", "git_sha": "344638191a5d6b3526556410819dfcf24e98039e", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/quast/quast.diff" }, "racon": { "branch": "master", diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf index e265df73..29fb78f5 100644 --- a/modules/nf-core/quast/main.nf +++ b/modules/nf-core/quast/main.nf @@ -14,7 +14,7 @@ process QUAST { output: tuple val(meta), path("${prefix}") , emit: results - tuple val(meta), path("${prefix}.tsv") , emit: tsv + tuple val(meta), path("report.tsv") , emit: tsv tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned @@ -37,7 +37,7 @@ process QUAST { $args \\ ${consensus.join(' ')} - ln -s ${prefix}/report.tsv ${prefix}.tsv + ln -s ${prefix}/report.tsv report.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff new file mode 100644 index 00000000..d267a2c9 --- /dev/null +++ b/modules/nf-core/quast/quast.diff @@ -0,0 +1,23 @@ +Changes in module 'nf-core/quast' +--- modules/nf-core/quast/main.nf ++++ 
modules/nf-core/quast/main.nf +@@ -14,7 +14,7 @@ + + output: + tuple val(meta), path("${prefix}") , emit: results +- tuple val(meta), path("${prefix}.tsv") , emit: tsv ++ tuple val(meta), path("report.tsv") , emit: tsv + tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome + tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies + tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned +@@ -37,7 +37,7 @@ + $args \\ + ${consensus.join(' ')} + +- ln -s ${prefix}/report.tsv ${prefix}.tsv ++ ln -s ${prefix}/report.tsv report.tsv + [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv + [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv + [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv + +************************************************************ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 694c5e9f..1f106b34 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -396,8 +396,7 @@ workflow BACASS { // // TODO: Create kmerfinder mode for longreads - // TODO: hack multiqc to group quast-entries by refseqid? 
- // TODO: PASS QUAST_BYREF TSV TO MULTIQC + // TODO: add new column to multiqc with refseq name if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -418,7 +417,7 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Processing output: + // Processing output: group data according to their ref-genome and rename meta according to the number of identified references ch_consensus_byrefseq .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] @@ -450,6 +449,7 @@ workflow BACASS { params.reference_fasta ?: [[:],[]], params.reference_gff ?: [[:],[]] ) + ch_quast_multiqc = QUAST.out.tsv } else if (ch_to_quast_byrefseq){ QUAST( ch_to_quast, @@ -462,7 +462,7 @@ workflow BACASS { ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} ) } - ch_quast_multiqc = QUAST.out.tsv + ch_quast_multiqc = QUAST_BYREFSEQID.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that require further processing for gene annotation From 7aff2f684c351591c7b757c986ee8b81a3f00bef Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 22 Nov 2023 13:55:32 +0100 Subject: [PATCH 14/58] add byrefseq quast reports to multiqc and patch quast --- assets/multiqc_config.yml | 118 +++++++++++++++++++- bin/multiqc_to_custom_csv.py | 184 +++++++++++++++++++++++++++++++ conf/modules.config | 4 +- modules.json | 3 +- modules/local/multiqc_custom.nf | 54 +++++++++ modules/nf-core/quast/main.nf | 4 +- modules/nf-core/quast/quast.diff | 23 ++++ workflows/bacass.nf | 63 ++++++----- 8 files changed, 421 insertions(+), 32 deletions(-) create mode 100755 bin/multiqc_to_custom_csv.py create mode 100644 
modules/local/multiqc_custom.nf create mode 100644 modules/nf-core/quast/quast.diff diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index d1c68735..e1c0dc5c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,12 +2,126 @@ report_comment: > This report has been generated by the nf-core/bacass analysis pipeline. For information about how to interpret these results, please see the documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanoplot + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast (Unicycler)" + anchor: "quast_unicycler" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." 
+ path_filters: + - "./quast_unicycler/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + + report_section_order: - "nf-core-bacass-methods-description": + fastqc: + after: general_stats + fastp: + after: fastqc + nanoplot: + after: general_stats + porechop: + after: nanoplot + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: order: -1000 software_versions: order: -1001 - "nf-core-bacass-summary": + nf-core-bacass-summary: order: -1002 +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Contigs (Unicycler)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (Unicycler)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (Unicycler)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Reference Genome (Kmerfinder)": + description: "Reference genome calculated by Blast" + format: "{:,.0f}" + export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same 
as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py new file mode 100755 index 00000000..3f536b1c --- /dev/null +++ b/bin/multiqc_to_custom_csv.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +# Sourced and Edited from nf-core/viralrecon: +# https://github.com/nf-core/viralrecon/blob/master/bin/multiqc_to_custom_csv.py#L59 +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = ( + "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/bacass pipeline." + ) + Epilog = "Example usage: python multiqc_to_custom_csv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. 
(default: 'multiqc_data').", + ) + parser.add_argument( + "-op", + "--out_prefix", + type=str, + dest="OUT_PREFIX", + default="summary", + help="Full path to output prefix (default: 'summary').", + ) + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k, v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[]): + integer_fields = [ + "mapped_passed", + "number_of_SNPs", + "number_of_indels", + "MISSENSE", + "# contigs (>= 0 bp)", + "# contigs (>= 5000 bp)", + "Largest contig", + ] + if os.path.exists(yaml_file): + with open(yaml_file) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + include_sample = True + if len(valid_sample_list) != 0 and key not in valid_sample_list: + include_sample = False + if include_sample: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + val = list(find_tag(yaml_dict[k], j[0])) + ## Fix for Cutadapt reporting reads/pairs as separate values + if j[0] == "r_written" and len(val) == 0: + val = [list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2] + if len(val) != 0: + val = val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if j[0] in integer_fields: + val = int(val) + if i not in append_dict[key]: + append_dict[key][i] = val + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. 
YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = yaml_dict[k] + else: + print("WARNING: File does not exist: {}".format(yaml_file)) + if len(valid_sample_list) != 0: + for key in valid_sample_list: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + if i not in append_dict[key]: + append_dict[key][i] = "NA" + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = "NA" + return append_dict + + +def metrics_dict_to_file(file_field_list, multiqc_data_dir, out_file, valid_sample_list=[]): + metrics_dict = {} + field_list = [] + for yaml_file, mapping_list in file_field_list: + yaml_file = os.path.join(multiqc_data_dir, yaml_file) + metrics_dict = yaml_fields_to_dict( + yaml_file=yaml_file, + append_dict=metrics_dict, + field_mapping_list=mapping_list, + valid_sample_list=valid_sample_list, + ) + field_list += [x[0] for x in mapping_list] + + if metrics_dict != {}: + make_dir(os.path.dirname(out_file)) + fout = open(out_file, "w") + header = ["Sample"] + field_list + fout.write("{}\n".format(",".join(header))) + for k in sorted(metrics_dict.keys()): + row_list = [k] + for field in field_list: + if field in metrics_dict[k]: + if metrics_dict[k][field]: + row_list.append(str(metrics_dict[k][field]).replace(",", ";")) + else: + row_list.append("NA") + else: + row_list.append("NA") + fout.write("{}\n".format(",".join(row_list))) + fout.close() + return metrics_dict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + illumina_assembly_files = [ + ( + "multiqc_quast_quast_unicycler.yaml", + [ + ("# Contigs (Unicycler)", ["# contigs (>= 0 bp)"]), + ("# Largest contig (Unicycler)", ["Largest contig"]), + ("# N50 (Unicycler)", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + 
"multiqc_quast_extra.yaml", + [ + ("# Reference Genome (Kmerfinder)", ["RefGenome"]), + ] + ), + ] + + ## Write de novo assembly metrics to file + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index 41dd51f3..afeb53f2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -204,7 +204,7 @@ process { if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null } else if (filename.startsWith('GCF')){ - "report_bySampleReference/${filename}" + "runs_per_reference/${filename}" } else { "${filename}" @@ -233,7 +233,7 @@ process { } withName: 'MULTIQC' { - ext.args = '' + ext.args = '-k yaml' publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, diff --git a/modules.json b/modules.json index 8a3cf78d..954e22da 100644 --- a/modules.json +++ b/modules.json @@ -86,7 +86,8 @@ "quast": { "branch": "master", "git_sha": "344638191a5d6b3526556410819dfcf24e98039e", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/quast/quast.diff" }, "racon": { "branch": "master", diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf new file mode 100644 index 00000000..59b0ceb7 --- /dev/null +++ b/modules/local/multiqc_custom.nf @@ -0,0 +1,54 @@ +process MULTIQC { + label 'process_medium' + + conda "bioconda::multiqc=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + + input: + path 'multiqc_config.yaml' + path multiqc_custom_config + path software_versions + //path workflow_summary + path multiqc_logo + path ('fastqc/*') + path ('fastp/*') + path ('nanoplot/*') + path ('porechop/*') + path ('pycoqc/*') + path ('kraken2_short/*') + path ('kraken2_long/*') + path ('quast_unicycler/*') + path ('prokka/*') + path ('bakta/*') + path ('extra/*') + + output: + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def custom_config = multiqc_custom_config ? "--config $multiqc_custom_config" : '' + """ + ## Run MultiQC once to parse tool logs + multiqc -f $args $custom_config . + + ## Collect extra fields to be included in the report + cp extra/* multiqc_data/ + + ## Parse YAML files dumped by MultiQC to obtain metrics + multiqc_to_custom_csv.py + + ## Run multiqc a second time + multiqc -f $args -e general_stats $custom_config . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf index e265df73..29fb78f5 100644 --- a/modules/nf-core/quast/main.nf +++ b/modules/nf-core/quast/main.nf @@ -14,7 +14,7 @@ process QUAST { output: tuple val(meta), path("${prefix}") , emit: results - tuple val(meta), path("${prefix}.tsv") , emit: tsv + tuple val(meta), path("report.tsv") , emit: tsv tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned @@ -37,7 +37,7 @@ process QUAST { $args \\ ${consensus.join(' ')} - ln -s ${prefix}/report.tsv ${prefix}.tsv + ln -s ${prefix}/report.tsv report.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff new file mode 100644 index 00000000..d267a2c9 --- /dev/null +++ b/modules/nf-core/quast/quast.diff @@ -0,0 +1,23 @@ +Changes in module 'nf-core/quast' +--- modules/nf-core/quast/main.nf ++++ modules/nf-core/quast/main.nf +@@ -14,7 +14,7 @@ + + output: + tuple val(meta), path("${prefix}") , emit: results +- tuple val(meta), path("${prefix}.tsv") , emit: tsv ++ tuple val(meta), path("report.tsv") , emit: tsv + tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome + tuple val(meta), 
path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies + tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned +@@ -37,7 +37,7 @@ + $args \\ + ${consensus.join(' ')} + +- ln -s ${prefix}/report.tsv ${prefix}.tsv ++ ln -s ${prefix}/report.tsv report.tsv + [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv + [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv + [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv + +************************************************************ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 694c5e9f..dbf934b5 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -40,8 +40,8 @@ if(! params.skip_kraken2){ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) @@ -60,6 +60,8 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { DFAST } from '../modules/local/dfast' +include { CUSTOM_MQC_TABLES } from '../modules/local/custom_mqc_tables' +include { MULTIQC } from '../modules/local/multiqc_custom' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -93,7 +95,6 @@ include { GUNZIP } from '../modules/nf-core/gunzi include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules @@ -396,8 +397,7 @@ workflow BACASS { // // TODO: Create kmerfinder mode for longreads - // TODO: hack multiqc to group quast-entries by refseqid? 
- // TODO: PASS QUAST_BYREF TSV TO MULTIQC + // TODO: add new column to multiqc with refseq name if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -418,7 +418,7 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Processing output: + // Processing output: group data according to their ref-genome and rename meta according to the number of identified references ch_consensus_byrefseq .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] @@ -439,6 +439,7 @@ workflow BACASS { // // MODULE: QUAST, assembly QC // + // FIXME: simplify it. I think choosing another approach will improve it ch_assembly .collect{ it[1]} .map{ consensus -> tuple([id:'report'], consensus)} @@ -450,6 +451,7 @@ workflow BACASS { params.reference_fasta ?: [[:],[]], params.reference_gff ?: [[:],[]] ) + ch_quast_multiqc = QUAST.out.results } else if (ch_to_quast_byrefseq){ QUAST( ch_to_quast, @@ -462,7 +464,7 @@ workflow BACASS { ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} ) } - ch_quast_multiqc = QUAST.out.tsv + ch_quast_multiqc = QUAST_BYREFSEQID.out.results ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that require further processing for gene annotation @@ -538,27 +540,38 @@ workflow BACASS { methods_description = WorkflowBacass.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = 
ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_trim_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_trim_json_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_quast_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([])) + // TODO: Clean this. find a better place. + ch_to_quast_byrefseq + .map{ + refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus) + } + .transpose() + .map { + [it[1].getSimpleName(), it[0]['id']] + } + .collectFile(name: 'multiqc_quast_extra.yaml') { + sample_name, refseqid -> + "$sample_name:\n RefGenome: $refseqid\n" + } + .set { ch_extra_multiqc } MULTIQC ( - ch_multiqc_files.collect(), ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - ch_multiqc_logo.collect().ifEmpty([]) + ch_multiqc_custom_config, + CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), + //ch_workflow_summary, // FIXME: Cannot parse this file... 
+ ch_multiqc_logo.collect().ifEmpty([]), + ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), + ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), + ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([]), + ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision + ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_extra_multiqc.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() } From de8e877a6af428ef18d8082883d2fcd97d58f955 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 11:17:16 +0100 Subject: [PATCH 15/58] update multiqc and append fastp metrics to assmebly metrics df --- assets/multiqc_config.yml | 10 +++++++++- bin/multiqc_to_custom_csv.py | 7 +++++++ modules/local/multiqc_custom.nf | 6 +++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index e1c0dc5c..3dd4fbc0 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -28,6 +28,8 @@ module_order: - fastp: name: "PREPROCESS: fastp (adapter trimming)" info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" - nanostat: name: "PREPROCESS: Nanoplot" info: "This section of the report shows Nanoplot results for nanopore sequencing data." 
@@ -70,7 +72,7 @@ report_section_order: fastqc: after: general_stats fastp: - after: fastqc + after: general_stats nanoplot: after: general_stats porechop: @@ -99,6 +101,12 @@ custom_data: "Sample": description: "Input sample names" format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" "# Contigs (Unicycler)": description: "Total number of contigs calculated by QUAST" format: "{:,.0f}" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 3f536b1c..8d9549c4 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -155,6 +155,13 @@ def main(args=None): ## File names for MultiQC YAML along with fields to fetch from each file illumina_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input reads", ["before_filtering", "total_reads"]), + ("# Trimmed reads (fastp)", ["after_filtering", "total_reads"]), + ] + ), ( "multiqc_quast_quast_unicycler.yaml", [ diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 59b0ceb7..488bc378 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.17" + conda "bioconda::multiqc=1.19" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path 'multiqc_config.yaml' From 7103bfa98cc174d0e9956265d595e443468009b5 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 12:51:40 +0100 Subject: [PATCH 16/58] add new method to complie kmerfinder results into multiqc report --- assets/multiqc_config.yml | 25 ++++++++- bin/csv_to_yaml.py | 58 ++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 11 +++- modules/local/find_download_reference.nf | 1 - modules/local/kmerfinder_summary.nf | 11 +++- modules/local/multiqc_custom.nf | 2 +- subworkflows/local/kmerfinder_subworkflow.nf | 13 +++-- workflows/bacass.nf | 28 ++-------- 8 files changed, 110 insertions(+), 39 deletions(-) create mode 100755 bin/csv_to_yaml.py diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 3dd4fbc0..caa58344 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -119,8 +119,29 @@ custom_data: "# % Genome fraction": description: "% genome fraction calculated by QUAST" format: "{:,.2f}" - "# Reference Genome (Kmerfinder)": - description: "Reference genome calculated by Blast" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit 
from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" format: "{:,.0f}" export_plots: true diff --git a/bin/csv_to_yaml.py b/bin/csv_to_yaml.py new file mode 100755 index 00000000..6f3fc9cf --- /dev/null +++ b/bin/csv_to_yaml.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +import sys +import argparse +import csv +import yaml + +def parse_args(args=None): + Description = ( + "Create a yaml file from csv input file grouping samples as keys and resting fields as their value pair." + ) + + Epilog = "Example usage: python csv_to_yaml.py -i myfile.csv -k 'sample_name' -o converted_file" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-i", + "--input", + type=str, + dest="CSV_FILE", + help="Input file in CSV format." + ) + + parser.add_argument( + "-k", + "--key_field", + type=str, + dest="KEY_FIELD", + help="Name of the key/column grupping field in the input csv." 
+ ) + + parser.add_argument( + "-op", + "--output_prefix", + type=str, + default="output_file", + dest="OUT_PREFIX", + help="Output file name" + ) + return parser.parse_args(args) + +def parse_csv(csv_file): + with open(csv_file, 'r') as c: + csv_reader = csv.DictReader(c) + data = [ row for row in csv_reader] + return data + +def create_yaml(data, key, output_prefix): + yaml_data = {entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data} + with open( output_prefix + '.yaml' , 'w') as yaml_file: + yaml.dump(yaml_data, yaml_file, default_flow_style=False) + +def main(args=None): + args = parse_args(args) + file_list = parse_csv(args.CSV_FILE) + + create_yaml(data=file_list, key=args.KEY_FIELD, output_prefix=args.OUT_PREFIX) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 8d9549c4..444b2860 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -172,9 +172,16 @@ def main(args=None): ], ), ( - "multiqc_quast_extra.yaml", + "multiqc_kmerfinder.yaml", [ - ("# Reference Genome (Kmerfinder)", ["RefGenome"]), + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), ] ), ] diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 478d3b67..36b59f0c 100644 --- a/modules/local/find_download_reference.nf +++ 
b/modules/local/find_download_reference.nf @@ -12,7 +12,6 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_reference) output: - tuple val(meta), path( "references_found.tsv") , emit: target_references_tsv tuple val(meta), path( "*.fna.gz") , emit: fna tuple val(meta), path( "*.gff.gz") , emit: gff tuple val(meta), path( "*.faa.gz") , emit: faa diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index 58fe104f..8e5fe45f 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -2,22 +2,27 @@ process KMERFINDER_SUMMARY { tag "kmerfinder_summary" label 'process_low' - conda "bioconda::python=3.10.0" + conda "bioconda::multiqc=1.19" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.10' : - 'biocontainers/python:3.10' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path(report, stageAs: 'reports/*') output: path "*.csv" , emit: summary + path "*.yaml" , emit: yaml path "versions.yml" , emit: versions script: """ + ## summarizing kmerfinder results kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv + ## Create a yaml file from csv + csv_to_yaml.py -i kmerfinder_summary.csv -k 'sample_name' -op kmerfinder_summary + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | awk '{print \$2}') diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 488bc378..30ab46b7 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -37,7 +37,7 @@ process MULTIQC { ## Run MultiQC once to parse tool logs multiqc -f $args $custom_config . 
- ## Collect extra fields to be included in the report + ## Collect additional files to be included in the report cp extra/* multiqc_data/ ## Parse YAML files dumped by MultiQC to obtain metrics diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index 99b9b4c3..ffe3f74e 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -29,17 +29,17 @@ workflow KMERFINDER_SUBWORKFLOW { KMERFINDER_SUMMARY ( ch_kmerfinder_report.map{meta, report -> report }.collect() ) - ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - // SUBWORKFLOW: Group sample assemblies by reference geneome + // SUBWORKFLOW: Group assemblies by reference genome ch_kmerfinder_json .join(ch_kmerfinder_report, by:0) .join(consensus, by:0) .map{ - meta, json, report_txt, fasta -> + meta, report_json, report_txt, fasta -> def refseq = [:] - refseq.id = json - .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + refseq.id = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] return tuple(refseq, meta, report_txt, fasta) } .groupTuple(by:0) @@ -64,7 +64,8 @@ workflow KMERFINDER_SUBWORKFLOW { emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] - refseqids = ch_refseqid + summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yaml) ] + refseqids = ch_refseqid // channel: [ val(refseq1), val(refseq2),...] 
reference_fasta = ch_reference_fasta // channel: [ meta, path(*.fna) ] reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index dbf934b5..05ce3508 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -60,7 +60,6 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { DFAST } from '../modules/local/dfast' -include { CUSTOM_MQC_TABLES } from '../modules/local/custom_mqc_tables' include { MULTIQC } from '../modules/local/multiqc_custom' // @@ -394,10 +393,8 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity - // - // TODO: Create kmerfinder mode for longreads - // TODO: add new column to multiqc with refseq name + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -419,13 +416,12 @@ workflow BACASS { ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) // Processing output: group data according to their ref-genome and rename meta according to the number of identified references - ch_consensus_byrefseq + ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] .groupTuple(by:0) .map { - refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> - ch_refseqid.size() + refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> ch_refseqid.size() if (ch_refseqid.size().getVal() > 1 ){ return [refseq, consensus.flatten(), ref_fasta, ref_gff] } else { @@ -435,7 +431,6 @@ workflow BACASS { .set { ch_to_quast_byrefseq } } - // // MODULE: QUAST, assembly QC 
// @@ -540,21 +535,6 @@ workflow BACASS { methods_description = WorkflowBacass.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) - // TODO: Clean this. find a better place. - ch_to_quast_byrefseq - .map{ - refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus) - } - .transpose() - .map { - [it[1].getSimpleName(), it[0]['id']] - } - .collectFile(name: 'multiqc_quast_extra.yaml') { - sample_name, refseqid -> - "$sample_name:\n RefGenome: $refseqid\n" - } - .set { ch_extra_multiqc } - MULTIQC ( ch_multiqc_config, ch_multiqc_custom_config, @@ -571,7 +551,7 @@ workflow BACASS { ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), - ch_extra_multiqc.collect().ifEmpty([]) + KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), ) multiqc_report = MULTIQC.out.report.toList() } From 854782df8a702dd494002f85ffa55b453c5dedf1 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 16:58:33 +0100 Subject: [PATCH 17/58] add long reads assembly metrics to custom multiqc --- assets/multiqc_config.yml | 147 +------------------------------ assets/multiqc_config_long.yml | 139 +++++++++++++++++++++++++++++ assets/multiqc_config_short.yml | 133 ++++++++++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 71 ++++++++++++--- modules.json | 3 +- modules/local/kmerfinder.nf | 4 +- modules/local/multiqc_custom.nf | 4 +- modules/nf-core/racon/main.nf | 6 +- modules/nf-core/racon/racon.diff | 26 ++++++ workflows/bacass.nf | 25 ++++-- 10 files changed, 386 insertions(+), 172 deletions(-) create mode 100644 assets/multiqc_config_long.yml create mode 100644 assets/multiqc_config_short.yml create mode 100644 modules/nf-core/racon/racon.diff diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index caa58344..d1c68735 
100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,155 +2,12 @@ report_comment: > This report has been generated by the nf-core/bacass analysis pipeline. For information about how to interpret these results, please see the documentation. - -data_format: "yaml" - -max_table_rows: 10000 - -run_modules: - - custom_content - - fastqc - - fastp - - nanoplot - - porechop - - pycoqc - - kraken2 - - quast - - prokka - - bakta - -module_order: - - fastqc: - name: "PREPROCESS: FastQC (raw reads)" - info: "This section of the report shows FastQC results for the raw reads before adapter trimming." - path_filters: - - "./fastqc/*.zip" - - fastp: - name: "PREPROCESS: fastp (adapter trimming)" - info: "This section of the report shows fastp results for reads after adapter and quality trimming." - path_filters: - - "./fastp/*.json" - - nanostat: - name: "PREPROCESS: Nanoplot" - info: "This section of the report shows Nanoplot results for nanopore sequencing data." - path_filters: - - "./nanoplot/*.txt" - - porechop: - name: "PREPROCESS: Porechop" - info: "This section of the report shows Porechop results for reads after adapter trimming." - path_filters: - - "./porechop/*.log" - - pycoqc: - name: "PREPROCESS: PycoQC" - info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." - path_filters: - - "./pycoqc/*.txt" - - kraken2: - name: "CONTAMINATION ANALYSIS: Kraken 2" - info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." - path_filters: - - ".*kraken2_*/*report.txt" - - quast: - name: "ASSEMBLY: Quast (Unicycler)" - anchor: "quast_unicycler" - info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." 
- path_filters: - - "./quast_unicycler/*/report.tsv" - - prokka: - name: "ANNOTATION: Prokka" - info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." - path_filters: - - "./prokka/*.txt" - - bakta: - name: "ANNOTATION: Bakta" - info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." - path_filters: - - "./bakta/*.txt" - - report_section_order: - fastqc: - after: general_stats - fastp: - after: general_stats - nanoplot: - after: general_stats - porechop: - after: nanoplot - kraken2: - after: general_stats - quast: - after: general_stats - prokka: - before: nf-core-bacass-methods-description - bakta: - before: nf-core-bacass-methods-description - nf-core-bacass-methods-description: + "nf-core-bacass-methods-description": order: -1000 software_versions: order: -1001 - nf-core-bacass-summary: + "nf-core-bacass-summary": order: -1002 -custom_data: - summary_assembly_metrics: - section_name: "De novo assembly metrics" - description: "generated by nf-core/bacass" - plot_type: "table" - headers: - "Sample": - description: "Input sample names" - format: "{:,.0f}" - "# Input reads": - description: "Total number of input reads in raw fastq files" - format: "{:,.0f}" - "# Trimmed reads (fastp)": - description: "Total number of reads remaining after adapter/quality trimming with fastp" - format: "{:,.0f}" - "# Contigs (Unicycler)": - description: "Total number of contigs calculated by QUAST" - format: "{:,.0f}" - "# Largest contig (Unicycler)": - description: "Size of largest contig calculated by QUAST" - format: "{:,.0f}" - "# N50 (Unicycler)": - description: "N50 metric for de novo assembly as calculated by QUAST" - format: "{:,.0f}" - "# % Genome fraction": - description: "% genome fraction calculated by QUAST" - format: "{:,.2f}" - "# Best hit (Kmerfinder)": - description: "Specie name of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit 
assembly ID (Kmerfinder)": - description: "Assembly ID of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit query coverage (Kmerfinder)": - description: "Query coverage value of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit depth (Kmerfinder)": - description: "Depth of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit (Kmerfinder)": - description: "Specie name of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit assembly ID (Kmerfinder)": - description: "Assembly ID of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit query coverage (Kmerfinder)": - description: "Query coverage value of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit depth (Kmerfinder)": - description: "Depth of the second hit from Kmerfinder" - format: "{:,.0f}" - export_plots: true - -# # Customise the module search patterns to speed up execution time -# # - Skip module sub-tools that we are not interested in -# # - Replace file-content searching with filename pattern searching -# # - Don't add anything that is the same as the MultiQC default -# # See https://multiqc.info/docs/#optimise-file-search-patterns for details -sp: - fastp: - fn: "*.fastp.json" diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml new file mode 100644 index 00000000..083ea39b --- /dev/null +++ b/assets/multiqc_config_long.yml @@ -0,0 +1,139 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +module_order: + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." 
+ path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + + +report_section_order: + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (long-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median read lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median read quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID 
(Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml new file mode 100644 index 00000000..ae8eaebe --- /dev/null +++ b/assets/multiqc_config_short.yml @@ -0,0 +1,133 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - kraken2 + - quast + - prokka + - bakta + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." 
+ path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (short-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + 
format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 444b2860..bccb0143 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -22,6 +22,14 @@ def parse_args(args=None): default="multiqc_data", help="Full path to directory containing YAML files for each module, as generated by MultiQC. 
(default: 'multiqc_data').", ) + parser.add_argument( + "-t", + "--assembly_type", + type=str, + dest="ASSEMBLY_TYPE", + default="short", + help="String defining the assembly mode for genome de novo assembly (options: short, long, hybrid).", + ) parser.add_argument( "-op", "--out_prefix", @@ -163,11 +171,11 @@ def main(args=None): ] ), ( - "multiqc_quast_quast_unicycler.yaml", + "multiqc_quast.yaml", [ - ("# Contigs (Unicycler)", ["# contigs (>= 0 bp)"]), - ("# Largest contig (Unicycler)", ["Largest contig"]), - ("# N50 (Unicycler)", ["N50"]), + ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), ], ), @@ -180,19 +188,60 @@ def main(args=None): ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), - ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage:"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ] + ), + ] + + nanopore_assembly_files = [ + ( + "multiqc_nanostat.yaml", + [ + ("# Input reads", ["Number of reads_fastq"]), + ("# Median read lenght", ["Median read length_fastq"]), + ("# Median read quality", ["Median read quality_fastq"]), + ] + ), + ( + "multiqc_quast.yaml", # TODO: "multiqc_quast_quast_{assemblertool}.yaml" + [ + ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", 
["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), ] ), ] ## Write de novo assembly metrics to file - metrics_dict_to_file( - file_field_list=illumina_assembly_files, - multiqc_data_dir=args.MULTIQC_DATA_DIR, - out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", - valid_sample_list=[], - ) + if args.ASSEMBLY_TYPE == 'short': + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == 'long': + metrics_dict_to_file( + file_field_list=nanopore_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) if __name__ == "__main__": sys.exit(main()) diff --git a/modules.json b/modules.json index 954e22da..abdd7f62 100644 --- a/modules.json +++ b/modules.json @@ -92,7 +92,8 @@ "racon": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/racon/racon.diff" }, "samtools/index": { "branch": "master", diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index 58a6de83..59e1d5b3 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -17,8 +17,8 @@ process KMERFINDER { path "versions.yml" , emit: versions script: - def prefix = task.ext.prefix ?: "${meta.id}" - def in_reads = reads.size() == 1 ? 
"${reads}" : "${reads[0]} ${reads[1]}" + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}" """ kmerfinder.py \\ diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 30ab46b7..025fcac3 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -19,7 +19,7 @@ process MULTIQC { path ('pycoqc/*') path ('kraken2_short/*') path ('kraken2_long/*') - path ('quast_unicycler/*') + path ('quast/*') path ('prokka/*') path ('bakta/*') path ('extra/*') @@ -41,7 +41,7 @@ process MULTIQC { cp extra/* multiqc_data/ ## Parse YAML files dumped by MultiQC to obtain metrics - multiqc_to_custom_csv.py + multiqc_to_custom_csv.py --assembly_type $params.assembly_type ## Run multiqc a second time multiqc -f $args -e general_stats $custom_config . diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf index 6d0cceb2..8f1cbfa9 100644 --- a/modules/nf-core/racon/main.nf +++ b/modules/nf-core/racon/main.nf @@ -11,7 +11,7 @@ process RACON { tuple val(meta), path(reads), path(assembly), path(paf) output: - tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly path "versions.yml" , emit: versions when: @@ -26,9 +26,9 @@ process RACON { "${paf}" \\ $args \\ "${assembly}" > \\ - ${prefix}_assembly_consensus.fasta + ${prefix}.consensus.fasta - gzip -n ${prefix}_assembly_consensus.fasta + gzip -n ${prefix}.consensus.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/racon/racon.diff b/modules/nf-core/racon/racon.diff new file mode 100644 index 00000000..c6e8d118 --- /dev/null +++ b/modules/nf-core/racon/racon.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/racon' +--- modules/nf-core/racon/main.nf ++++ modules/nf-core/racon/main.nf +@@ -11,7 +11,7 @@ + tuple val(meta), path(reads), path(assembly), path(paf) + + 
output: +- tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly ++ tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: +@@ -26,9 +26,9 @@ + "${paf}" \\ + $args \\ + "${assembly}" > \\ +- ${prefix}_assembly_consensus.fasta ++ ${prefix}.consensus.fasta + +- gzip -n ${prefix}_assembly_consensus.fasta ++ gzip -n ${prefix}.consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 05ce3508..d393732d 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -39,8 +39,11 @@ if(! params.skip_kraken2){ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +if(params.assembly_type){ + ch_multiqc_config = file("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) +} else { + ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +} ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) @@ -392,8 +395,8 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity - // TODO: Create kmerfinder mode for longreads + // MODULE: Kmerfinder, QC for sample purity. 
Available for short, long and hybrid assemblies + // if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ @@ -403,11 +406,17 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } + if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) { + ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads + } else if ( params.assembly_type == 'long' ) { + ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads + } + KMERFINDER_SUBWORKFLOW ( ch_kmerfinderdb, params.reference_ncbi_bacteria, - ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] - ch_assembly // [meta, consensus] + ch_for_kmerfinder, + ch_assembly ) ch_refseqid = KMERFINDER_SUBWORKFLOW.out.refseqids ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta @@ -415,7 +424,7 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Processing output: group data according to their ref-genome and rename meta according to the number of identified references + // Group data based on ref-genome and rename meta according to the identified references count. 
ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] @@ -548,7 +557,7 @@ workflow BACASS { ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), - ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // TODO: Create a quast channel for each assembler ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), From 0f3970ee8ee78f91770b49559e632f17e63fc1bc Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 3 Jan 2024 13:45:29 +0100 Subject: [PATCH 18/58] fix custom multiqc when kmerfinder is not invoked --- assets/multiqc_config_long.yml | 3 ++ assets/multiqc_config_short.yml | 3 ++ conf/modules.config | 43 ++++++++++++------------ modules.json | 5 --- modules/local/multiqc_custom.nf | 24 +++++++++----- modules/nf-core/multiqc/main.nf | 53 ------------------------------ modules/nf-core/multiqc/meta.yml | 56 -------------------------------- workflows/bacass.nf | 48 +++++++++++++++++---------- 8 files changed, 76 insertions(+), 159 deletions(-) delete mode 100644 modules/nf-core/multiqc/main.nf delete mode 100644 modules/nf-core/multiqc/meta.yml diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml index 083ea39b..7c5349ba 100644 --- a/assets/multiqc_config_long.yml +++ b/assets/multiqc_config_long.yml @@ -17,6 +17,9 @@ run_modules: - prokka - bakta +exclude_modules: + - general_stats + module_order: - nanostat: name: "PREPROCESS: Nanoplot" diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml index ae8eaebe..c068b167 100644 --- 
a/assets/multiqc_config_short.yml +++ b/assets/multiqc_config_short.yml @@ -16,6 +16,9 @@ run_modules: - prokka - bakta +exclude_modules: + - general_stats + module_order: - fastqc: name: "PREPROCESS: FastQC (raw reads)" diff --git a/conf/modules.config b/conf/modules.config index afeb53f2..3bd7d064 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -159,26 +159,6 @@ process { ] } - withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { - ext.args = '' - publishDir = [ - path: { "${params.outdir}/Kmerfinder/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: "*.txt", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { - ext.args = '' - publishDir = [ - path: { "${params.outdir}/Kmerfinder" }, - mode: params.publish_dir_mode, - pattern: "*.csv", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ @@ -306,6 +286,29 @@ if (!params.skip_fastp) { } } } +if (!params.skip_kmerfinder) { + process { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.txt", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } +} if (params.annotation_tool == 'bakta') { if (params.baktadb_download == true) { diff --git a/modules.json b/modules.json index abdd7f62..d30a7ef9 100644 --- a/modules.json +++ b/modules.json @@ -61,11 +61,6 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, - "multiqc": { - "branch": "master", - "git_sha": "c4e79dd48ab2cedea2d7d525582bea061c241e0f", - "installed_by": ["modules"] - }, "nanoplot": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 025fcac3..32a49dd5 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -10,7 +10,7 @@ process MULTIQC { path 'multiqc_config.yaml' path multiqc_custom_config path software_versions - //path workflow_summary + path workflow_summary path multiqc_logo path ('fastqc/*') path ('fastp/*') @@ -25,10 +25,11 @@ process MULTIQC { path ('extra/*') output: - path "*multiqc_report.html" , emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_assembly_metrics_mqc.csv" , optional:true, emit: csv_assembly + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' @@ -38,13 +39,20 @@ process MULTIQC { multiqc -f $args $custom_config . ## Collect additional files to be included in the report - cp extra/* multiqc_data/ + if [ -d extra/ ]; then + cp extra/* multiqc_data/ + fi - ## Parse YAML files dumped by MultiQC to obtain metrics + ## Create multiqc custom data multiqc_to_custom_csv.py --assembly_type $params.assembly_type + ## Avoid the custom Multiqc table when the kmerfinder process is not invoked. 
+ if grep ">skip_kmerfinder<" workflow_summary_mqc.yaml; then + rm *_assembly_metrics_mqc.csv + fi + ## Run multiqc a second time - multiqc -f $args -e general_stats $custom_config . + multiqc -f $args $custom_config . cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 40a691eb..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,53 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "bioconda::multiqc=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - . 
- - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - touch multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index f93b5ee5..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,56 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json -name: MultiQC -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] - -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
- pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" - -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/workflows/bacass.nf b/workflows/bacass.nf index d393732d..5dbc2d50 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -26,26 +26,34 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check krakendb -if(! params.skip_kraken2){ - if(params.kraken2db){ +if (!params.skip_kraken2) { + if (params.kraken2db) { kraken2db = file(params.kraken2db) } else { exit 1, "Missing Kraken2 DB arg" } } +// Check kmerfinderdb +if (!params.skip_kmerfinder && !params.kmerfinderdb){ + exit 1, "Missing Kmerfinder DB arg: --kmerfinderdb " +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -if(params.assembly_type){ + + +// When invoking kmerfinder, utilize a custom MultiQC config file to generate a specialized report. This report will organize samples into groups based on their reference genome, w were previously calculated by kmerfinder. 
+if (!params.skip_kmerfinder && params.assembly_type) { ch_multiqc_config = file("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) } else { ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) } ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* @@ -258,6 +266,7 @@ workflow BACASS { // // MODULE: Miniasm, genome assembly, long reads + // if ( params.assembler == 'miniasm' ) { MINIMAP2_ALIGN ( ch_for_assembly.map{ meta,sr,lr -> tuple(meta,lr) }, @@ -336,7 +345,7 @@ workflow BACASS { ch_for_polish // tuple val(meta), val(reads), file(longreads), file(assembly) .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) - .join( ch_fast5 ) // tuple val(meta), file(fast5) + .join( ch_fast5 ) // tuple val(meta), file(fast5) .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. @@ -395,10 +404,13 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity. Available for short, long and hybrid assemblies + // MODULE: Kmerfinder, QC for sample purity. 
// - if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + ch_kmerfinder_multiqc = Channel.empty() + if (!params.skip_kmerfinder) { + + // Process kmerfinder database if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) ch_kmerfinderdb = GUNZIP_KMERFINDERDB.out.gunzip @@ -406,6 +418,7 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } + // Set kmerfinder input based on assembly type if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) { ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads } else if ( params.assembly_type == 'long' ) { @@ -422,12 +435,13 @@ workflow BACASS { ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq + ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) // Group data based on ref-genome and rename meta according to the identified references count. ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] - .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff ] .groupTuple(by:0) .map { refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> ch_refseqid.size() @@ -445,9 +459,9 @@ workflow BACASS { // // FIXME: simplify it. 
I think choolsing anotherapproach will improve it ch_assembly - .collect{ it[1]} - .map{ consensus -> tuple([id:'report'], consensus)} - .set{ch_to_quast} + .collect{it[1]} + .map{ consensus -> tuple([id:'report'], consensus) } + .set{ ch_to_quast } if(params.skip_kmerfinder){ QUAST( @@ -456,7 +470,7 @@ workflow BACASS { params.reference_gff ?: [[:],[]] ) ch_quast_multiqc = QUAST.out.results - } else if (ch_to_quast_byrefseq){ + } else if (!params.skip_kmerfinder && ch_to_quast_byrefseq) { QUAST( ch_to_quast, [[:],[]], @@ -467,9 +481,9 @@ workflow BACASS { ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} ) + ch_quast_multiqc = QUAST_BYREFSEQID.out.results } - ch_quast_multiqc = QUAST_BYREFSEQID.out.results - ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that require further processing for gene annotation ch_assembly @@ -548,7 +562,7 @@ workflow BACASS { ch_multiqc_config, ch_multiqc_custom_config, CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), - //ch_workflow_summary, // FIXME: Cannot parse this file... 
+ ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), ch_multiqc_logo.collect().ifEmpty([]), ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), @@ -557,10 +571,10 @@ workflow BACASS { ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), - ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // TODO: Create a quast channel for each assembler + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), - KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), + ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) multiqc_report = MULTIQC.out.report.toList() } From 247f592640221db4df35ae08a824813ca32177a4 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 4 Jan 2024 11:41:59 +0100 Subject: [PATCH 19/58] add custom multiqc for hybrid assembly --- assets/multiqc_config_hybrid.yml | 166 +++++++++++++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 55 +++++++++- nextflow.config | 2 +- 3 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 assets/multiqc_config_hybrid.yml diff --git a/assets/multiqc_config_hybrid.yml b/assets/multiqc_config_hybrid.yml new file mode 100644 index 00000000..4c036265 --- /dev/null +++ b/assets/multiqc_config_hybrid.yml @@ -0,0 +1,166 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. 
+ +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (shorts & long reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input short reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed short reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Input long reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median long reads lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median long reads quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs (hybrid assembly)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (hybrid assembly)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (hybrid assembly)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction (hybrid assembly)": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit 
from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index bccb0143..1e79910b 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -67,7 +67,7 @@ def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_ "number_of_SNPs", "number_of_indels", "MISSENSE", - "# contigs (>= 0 bp)", + "# contigs", "# contigs (>= 5000 bp)", "Largest contig", ] @@ -173,7 +173,7 @@ def main(args=None): ( "multiqc_quast.yaml", [ - ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Contigs", ["# contigs"]), ("# Largest contig", ["Largest contig"]), ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), @@ -204,9 +204,9 @@ 
def main(args=None): ] ), ( - "multiqc_quast.yaml", # TODO: "multiqc_quast_quast_{assemblertool}.yaml" + "multiqc_quast.yaml", [ - ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Contigs", ["# contigs"]), ("# Largest contig", ["Largest contig"]), ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), @@ -227,6 +227,46 @@ def main(args=None): ), ] + hybrid_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input short reads", ["before_filtering", "total_reads"]), + ("# Trimmed short reads (fastp)", ["after_filtering", "total_reads"]), + ] + ), + ( + "multiqc_nanostat.yaml", + [ + ("# Input long reads", ["Number of reads_fastq"]), + ("# Median long reads lenght", ["Median read length_fastq"]), + ("# Median long reads quality", ["Median read quality_fastq"]), + ] + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs (hybrid assembly)", ["# contigs"]), + ("# Largest contig (hybrid assembly)", ["Largest contig"]), + ("# N50 (hybrid assembly)", ["N50"]), + ("# % Genome fraction (hybrid assembly)", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ] + ), + ] + ## Write de novo assembly metrics to file if args.ASSEMBLY_TYPE == 'short': metrics_dict_to_file( @@ -242,6 +282,13 @@ def main(args=None): out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", valid_sample_list=[], ) + elif args.ASSEMBLY_TYPE == 
'hybrid': + metrics_dict_to_file( + file_field_list=hybrid_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) if __name__ == "__main__": sys.exit(main()) diff --git a/nextflow.config b/nextflow.config index 4dc16396..d9d397b9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,7 +46,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false - skip_kmerfinder = false + skip_kmerfinder = true skip_pycoqc = false skip_annotation = false skip_polish = false From df1403dc3dc8d8153ca1d753e1986e0bdefd39c9 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 4 Jan 2024 15:24:33 +0100 Subject: [PATCH 20/58] add file-check-exist and rename variables --- bin/multiqc_to_custom_csv.py | 4 ---- modules/local/find_download_reference.nf | 6 ++++-- modules/local/kmerfinder.nf | 6 +++--- nextflow.config | 2 +- nextflow_schema.json | 6 +++--- subworkflows/local/kmerfinder_subworkflow.nf | 4 ++-- workflows/bacass.nf | 19 ++++++++++++------- 7 files changed, 25 insertions(+), 22 deletions(-) diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 1e79910b..391ca41e 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -63,10 +63,6 @@ def find_tag(d, tag): def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[]): integer_fields = [ - "mapped_passed", - "number_of_SNPs", - "number_of_indels", - "MISSENSE", "# contigs", "# contigs (>= 5000 bp)", "Largest contig", diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 36b59f0c..7ca5e8a3 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -9,7 +9,7 @@ process FIND_DOWNLOAD_REFERENCE { input: tuple val(meta), path(reports, stageAs: 'reports/*') - path(ncbi_reference) + path(ncbi_metadata_db) output: tuple val(meta), path( "*.fna.gz") , emit: fna @@ 
-19,13 +19,15 @@ process FIND_DOWNLOAD_REFERENCE { script: """ + ## Find the common reference genome find_common_reference.py \\ -d reports/ \\ -o references_found.tsv + ## Download the winner reference genome from the ncbi database download_reference.py \\ -file references_found.tsv \\ - -reference $ncbi_reference \\ + -reference $ncbi_metadata_db \\ -out_dir . cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index 59e1d5b3..92aff76d 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -9,7 +9,7 @@ process KMERFINDER { input: tuple val(meta), path(reads) - path(kmerfinderDB) + path(kmerfinder_db) output: tuple val(meta), path("*_results.txt") , emit: report @@ -24,8 +24,8 @@ process KMERFINDER { kmerfinder.py \\ --infile $in_reads \\ --output_folder . \\ - --db_path ${kmerfinderDB}/bacteria.ATG \\ - -tax ${kmerfinderDB}/bacteria.name \\ + --db_path ${kmerfinder_db}/bacteria.ATG \\ + -tax ${kmerfinder_db}/bacteria.name \\ -x mv results.txt ${prefix}_results.txt diff --git a/nextflow.config b/nextflow.config index d9d397b9..c5b768b9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,7 @@ params { kmerfinderdb = "" reference_fasta = "" reference_gff = "" - reference_ncbi_bacteria = "" + ncbi_assembly_metadata = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] diff --git a/nextflow_schema.json b/nextflow_schema.json index 0f0e9185..43a732e0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -76,7 +76,7 @@ }, "kmerfinderdb": { "type": "string", - "description": "Database for Kmerfinder.", + "description": "Path to the Kmerfinder bacteria database.", "help_text": "" }, "reference_fasta": { @@ -87,9 +87,9 @@ "type": "string", "description": "Reference GFF file." 
}, - "reference_ncbi_bacteria": { + "ncbi_assembly_metadata": { "type": "string", - "description": "NCBI Bacteria reference database" + "description": "Master file (*.txt) containing a summary of asseblies available in GeneBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" } } }, diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index ffe3f74e..d7775372 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -9,7 +9,7 @@ include { QUAST } from '../../modules/nf-core/quast/main' workflow KMERFINDER_SUBWORKFLOW { take: kmerfinder_db // channel: [ path ] - ncbi_bacteria_db // channel: [ path ] + ncbi_assembly_metadata // channel: [ path ] reads // channel: [ meta, reads ] consensus // channel: [ meta, consensus ] @@ -49,7 +49,7 @@ workflow KMERFINDER_SUBWORKFLOW { if (!params.reference_fasta && !params.reference_gff) { FIND_DOWNLOAD_REFERENCE ( ch_consensus_byrefseq.map{ refseq, meta, report_txt, fasta -> tuple(refseq, report_txt)}, - ncbi_bacteria_db + ncbi_assembly_metadata ) ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 5dbc2d50..a77f8dee 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -28,17 +28,21 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check krakendb if (!params.skip_kraken2) { if (params.kraken2db) { - kraken2db = file(params.kraken2db) + kraken2db = file(params.kraken2db, checkIfExists: true) } else { exit 1, "Missing Kraken2 DB arg" } } -// Check kmerfinderdb -if (!params.skip_kmerfinder && !params.kmerfinderdb){ - exit 1, "Missing Kmerfinder DB arg: --kmerfinderdb " +// Check kmerfinder dependencies +if (!params.skip_kmerfinder) { + if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) { + exit 1, "[KMERFINDER]: Missing 
--kmerfinder_db and/or --ncbi_assembly_metadata arguments. Both are required to run KMERFINDER." + } else { + file(params.kmerfinderdb, checkIfExists: true) + file(params.ncbi_assembly_metadata, checkIfExists: true) + } } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -404,8 +408,9 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity. + // SUBWORKFLOW: Kmerfinder, QC for sample purity. // + // TODO: Executes both kmerfinder and organizes samples by the reference genome (all this through the kmerfinder_subworkflow()). Ideally, users can also utilize kmerfinder independently without the need to download reference genome and grouping data —simply running kmerfinder alone-. ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { @@ -427,7 +432,7 @@ workflow BACASS { KMERFINDER_SUBWORKFLOW ( ch_kmerfinderdb, - params.reference_ncbi_bacteria, + params.ncbi_assembly_metadata, ch_for_kmerfinder, ch_assembly ) From c42735e44ffbf1815546787e5029013630e6683a Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 5 Jan 2024 12:50:15 +0100 Subject: [PATCH 21/58] update documentation and add save_trimmed option --- README.md | 11 +++++++++-- conf/modules.config | 7 +++++-- docs/output.md | 25 ++++++++++++++++++++++--- nextflow.config | 1 + nextflow_schema.json | 7 +++++-- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6ed64f2a..3ac71b85 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ### Short Read Assembly -This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). 
Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) to verify sample purity. +This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. ### Long Read Assembly -For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). +For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. + The pipeline can then perform long read assembly utilizing [Unicycler](https://github.com/rrwick/Unicycler), [Miniasm](https://github.com/lh3/miniasm) in combination with [Racon](https://github.com/isovic/racon), [Canu](https://github.com/marbl/canu) or [Flye](https://github.com/fenderglass/Flye) by using the [Dragonflye](https://github.com/rpetit3/dragonflye)(\*) pipeline. 
Long reads assembly can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files. > **\*Note**: Dragonflye is a comprehensive pipeline designed for genome assembly of Oxford Nanopore Reads. It facilitates the utilization of Flye (default), Miniasm, and Raven assemblers, along with Racon(default) and Medaka polishers. For more information, visit the [Dragonflye GitHub](https://github.com/rpetit3/dragonflye) repository. @@ -42,6 +43,12 @@ For users specifying both short read and long read (NanoPore) data, the pipeline In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core). +In specific cases where samples recorded in the input samplesheet belong to more than one species, the pipeline finds and downloads their respective reference genomes (this also works with a single-species input samplesheet). It then groups the samples into batches and collects assembly QC results based on their corresponding reference genomes. + +> NOTE: This scenario is only supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed. + + +In cases where the input samplesheet has samples from more than one species, the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST report containing all the input samples and a per-reference-genome QUAST report, that is, a QUAST report for each reference genome. 
## Usage :::note diff --git a/conf/modules.config b/conf/modules.config index 3bd7d064..95eb20b8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -36,7 +36,8 @@ process { publishDir = [ path: { "${params.outdir}/trimming/longreads" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enable: params.save_trimmed ] } @@ -180,6 +181,7 @@ process { publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, + pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}", saveAs: { filename -> if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null @@ -251,7 +253,8 @@ if (!params.skip_fastp) { path: { "${params.outdir}/trimming/shortreads" }, mode: params.publish_dir_mode, pattern: "*.fastp.fastq.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_trimmed ], [ path: { "${params.outdir}/trimming/shortreads/json_html" }, diff --git a/docs/output.md b/docs/output.md index ba44aa38..2b043a9c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -119,6 +119,21 @@ Exemplary Kraken2 report screenshot: +## Reads QC and Sample purity + +The pipeline includes a dedicated step for short and long reads QC as well as contamination analysis using [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/). This process helps assess the quality and purity of the samples. + +
+Output files + +- `Kmerfinder/{ID}/` + - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information. + +- `Kmerfinder/`: + - kmerfinder_summary.csv: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder. + +
+ ## Assembly Output Trimmed reads are assembled with [Unicycler](https://github.com/rrwick/Unicycler) in `short` or `hybrid` assembly modes. For long-read assembly, there are also `canu` and `miniasm` available. @@ -181,9 +196,12 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo
Output files -- `QUAST` - - `report.tsv`: QUAST's report in text format -- `QUAST/report` +- `QUAST/report/` + - `icarus.html`: QUAST's contig browser as HTML + - `report.html`: QUAST assembly QC as HTML report + - `report.pdf`: QUAST assembly QC as pdf + +- `QUAST/runs_per_reference/{reference_assembly}/` - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf @@ -241,6 +259,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - `multiqc_plots/`: directory containing static images from the report in various formats. + - summary_assembly_metrics_mqc.csv: custom table containing most relevant assembly QC metrics.
diff --git a/nextflow.config b/nextflow.config index c5b768b9..53b6c0fc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null // QC and trimming options + save_trimmed = false save_trimmed_fail = false save_merged = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 43a732e0..e39ca896 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,6 +42,10 @@ "description": "Parameters for QC and trim short-reads", "default": "", "properties": { + "save_trimmed": { + "type": "boolean", + "description": "save trimmed files" + }, "save_trimmed_fail": { "type": "boolean", "enum": ["true", "false"], @@ -76,8 +80,7 @@ }, "kmerfinderdb": { "type": "string", - "description": "Path to the Kmerfinder bacteria database.", - "help_text": "" + "description": "Path to the Kmerfinder bacteria database." }, "reference_fasta": { "type": "string", From 3357785ef3e102382f37cea1b74ab90e53984dba Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 5 Jan 2024 16:05:59 +0100 Subject: [PATCH 22/58] add fastp additional options and fix input sample path --- conf/modules.config | 4 ++-- nextflow.config | 1 + nextflow_schema.json | 4 ++++ workflows/bacass.nf | 6 +++--- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 95eb20b8..4c1c1700 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -247,7 +247,7 @@ if (!params.skip_fastqc) { if (!params.skip_fastp) { process { withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTP' { - ext.args = '' + ext.args = params.fastp_args ? params.fastp_args : '' publishDir = [ [ path: { "${params.outdir}/trimming/shortreads" }, @@ -296,7 +296,7 @@ if (!params.skip_kmerfinder) { publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.txt", + pattern: "*.{txt,json}", saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } diff --git a/nextflow.config b/nextflow.config index 53b6c0fc..c0ab1beb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null // QC and trimming options + fastp_args = "" save_trimmed = false save_trimmed_fail = false save_merged = false diff --git a/nextflow_schema.json b/nextflow_schema.json index e39ca896..07ac01e7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,6 +42,10 @@ "description": "Parameters for QC and trim short-reads", "default": "", "properties": { + "fastp_args": { + "type": "string", + "description": "This can be used to pass arguments to [Fastp](https://github.com/OpenGene/fastp)" + }, "save_trimmed": { "type": "boolean", "description": "save trimmed files" diff --git a/workflows/bacass.nf b/workflows/bacass.nf index a77f8dee..96fe9c9d 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -135,9 +135,9 @@ workflow BACASS { // def criteria = multiMapCriteria { meta, fastq_1, fastq_2, long_fastq, fast5 -> - shortreads: fastq_1 != 'NA' ? tuple(meta, [fastq_1, fastq_2]) : null - longreads: long_fastq != 'NA' ? tuple(meta, long_fastq) : null - fast5: fast5 != 'NA' ? tuple(meta, fast5) : null + shortreads: fastq_1 != 'NA' ? tuple(meta, [file(fastq_1), file(fastq_2)]) : null + longreads: long_fastq != 'NA' ? tuple(meta, file(long_fastq)) : null + fast5: fast5 != 'NA' ? 
tuple(meta, fast5) : null } // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ Channel From 1847f397001f547095f865a2c192e0d577bacae4 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 15 Jan 2024 15:42:20 +0100 Subject: [PATCH 23/58] allow module to emit tsv report --- modules/local/find_download_reference.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 7ca5e8a3..95cb8e09 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -12,10 +12,11 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_metadata_db) output: - tuple val(meta), path( "*.fna.gz") , emit: fna - tuple val(meta), path( "*.gff.gz") , emit: gff - tuple val(meta), path( "*.faa.gz") , emit: faa - path "versions.yml" , emit: versions + tuple val(meta), path("*.fna.gz") , emit: fna + tuple val(meta), path("*.gff.gz") , emit: gff + tuple val(meta), path("*.faa.gz") , emit: faa + tuple val(meta), path("references_found.tsv") , emit: references_tsv + path "versions.yml" , emit: versions script: """ From b73e3f089b84c4dca1ad47a4aed170cad86e5dca Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 6 Nov 2023 16:55:08 +0100 Subject: [PATCH 24/58] add kmerfinder for shortreads --- conf/modules.config | 10 ++++++++++ modules/local/kmerfinder.nf | 37 +++++++++++++++++++++++++++++++++++++ nextflow.config | 2 ++ workflows/bacass.nf | 23 +++++++++++++++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 modules/local/kmerfinder.nf diff --git a/conf/modules.config b/conf/modules.config index 03a35293..f67f1fe0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -160,6 +160,16 @@ process { ] } + withName: 'KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*_results.txt", + saveAs: { 
filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf new file mode 100644 index 00000000..db7fddf1 --- /dev/null +++ b/modules/local/kmerfinder.nf @@ -0,0 +1,37 @@ +process KMERFINDER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmerfinder=3.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmerfinder:3.0.2--hdfd78af_0' : + 'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + path(kmerfinderDB) + + output: + tuple val(meta), path("*_results.txt") , emit: report + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads.size() == 1 ? "${reads}" : "${reads[0]} ${reads[1]}" + + """ + kmerfinder.py \\ + --infile $in_reads \\ + --output_folder . 
\\ + --db_path ${kmerfinderDB}/bacteria.ATG \\ + -tax ${kmerfinderDB}/bacteria.name \\ + -x + + mv results.txt ${prefix}_results.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmerfinder: \$(echo "3.0.2") + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index bf5e7aff..bf69022e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,7 @@ params { // Contamination_screening kraken2db = "" + kmerfinderdb = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] @@ -42,6 +43,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false + skip_kmerfinder = false skip_pycoqc = false skip_annotation = false skip_polish = false diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 6e923340..36cede8f 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -59,6 +59,7 @@ include { UNICYCLER } from '../modules/local/unicycler' include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' +include { KMERFINDER } from '../modules/local/kmerfinder' include { DFAST } from '../modules/local/dfast' // @@ -219,7 +220,7 @@ workflow BACASS { .dump(tag: 'ch_for_assembly') .set { ch_for_assembly } } - +/* // // ASSEMBLY: Unicycler, Canu, Miniasm, Dragonflye // @@ -352,7 +353,7 @@ workflow BACASS { MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) ch_versions = ch_versions.mix(MEDAKA.out.versions.ifEmpty(null)) } - +*/ // // MODULE: Kraken2, QC for sample purity // @@ -388,6 +389,23 @@ workflow BACASS { ch_versions = ch_versions.mix(KRAKEN2_LONG.out.versions.ifEmpty(null)) } + // + // MODULE: Kmerfinder, QC for sample purity + // + + // TODO: add check contamination module // CALLIT PARSE_KMERFINDER + // TODO: if not provided, download reference from kmerfinder results --> module FIND_DOWNLOAD_COMMON_REFFERENCE + // TODO: Create 
kmerfinder mode for short and longreads + // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + KMERFINDER ( + ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] + params.kmerfinderdb // path(kmerfinder database) + ) + ch_kmerfinder_report = KMERFINDER.out.report + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + } +/* // // MODULE: QUAST, assembly QC // @@ -501,6 +519,7 @@ workflow BACASS { ) multiqc_report = MULTIQC.out.report.toList() } +*/ } /* From 764a3d0cc0a6bef82ac06f85ae867a392dcfb08f Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 6 Nov 2023 18:07:00 +0100 Subject: [PATCH 25/58] add module kmerfinder summary report --- bin/kmerfinder_summary.py | 204 ++++++++++++++++++++++++++++ conf/modules.config | 10 ++ modules/local/kmerfinder_summary.nf | 26 ++++ workflows/bacass.nf | 6 + 4 files changed, 246 insertions(+) create mode 100755 bin/kmerfinder_summary.py create mode 100644 modules/local/kmerfinder_summary.nf diff --git a/bin/kmerfinder_summary.py b/bin/kmerfinder_summary.py new file mode 100755 index 00000000..612a2525 --- /dev/null +++ b/bin/kmerfinder_summary.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 + + +import argparse +import sys +import re +import csv +import pickle +import os + + +################# +### FUNCTIONS ### +################# + + +def check_arg(args=None): + + """ + Description: + Function collect arguments from command line using argparse + Input: + args # command line arguments + Constant: + None + Variables + parser + Return + parser.parse_args() # Parsed arguments + """ + + parser = argparse.ArgumentParser( + prog="07-kmerfinder.py", + formatter_class=argparse.RawDescriptionHelpFormatter, + description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME + ) + + parser.add_argument( + "--path", + "-p", + required=True, + help="Insert path of results.txt file like 
/home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME + ) + + parser.add_argument( + "--output_bn", "-b", required=True, help="The output in binary file" + ) + + parser.add_argument( + "--output_csv", "-c", required=True, help="The output in csv file" + ) + + # Example: python3 parse_kmerfinder.py -p /home/s.gonzalez/07-kmerfinder -b p_dic.dicke -c p_kmer.csv + + return parser.parse_args() + + +################# +### FUNCTIONS ### +################# + + +def kmerfinder_dictionary(file_txt): + + """ + Description: + Function to extract the relevant part of result.txt file + Input: + result.txt file + Return: + dictionary + """ + + step = "07-kmerfinder_" # FIXME + + num_lines = sum(1 for line in open(file_txt)) + hits = num_lines - 1 # to count the total number of hits + lookupfile = open(file_txt, "r") + lines = lookupfile.readlines() + parameters = lines[0].strip().split("\t") + if num_lines > 1: + values_best_hit = lines[1].strip().split("\t") + if num_lines > 2: + values_second_hit = lines[2].strip().split("\t") + + kmer_dict = {} + + for i in range(len(parameters)): + if num_lines > 1: + kmer_dict[step + "best_hit_" + parameters[i]] = values_best_hit[i] + else: + kmer_dict[step + "best_hit_" + parameters[i]] = "" + + kmer_dict.update(Total_hits_07_kmerfinder=hits) + + if num_lines > 2: + + kmer_dict[step + "second_hit_" + parameters[i]] = values_second_hit[i] + + else: + + kmer_dict[step + "second_hit_" + parameters[i]] = "" + + return kmer_dict + + +################# +### FUNCTIONS ### +################# + + +def dictionary2bn(dictionary, binary_file): + + """ + + Description: + Function to create a binary file from a dictionary + Input: + dictionary + Return: + binary file + """ + + pickle_out = open(binary_file, "wb") + pickle.dump(dictionary, pickle_out) + pickle_out.close() + + return + + +################# +### FUNCTIONS ### +################# + + +def dictionary2csv(dictionary, csv_file): + + """ + + Description: + Function to create a csv from a 
dictionary + Input: + dictionary + Return: + csv file + + """ + + header = sorted(set(i for b in map(dict.keys, dictionary.values()) for i in b)) + with open(csv_file, "w", newline="") as f: + write = csv.writer(f) + write.writerow(["sample_name", *header]) + for a, b in dictionary.items(): + write.writerow([a] + [b.get(i, "") for i in header]) + return + + +################### +### MAIN SCRIPT ### +################### + + +if __name__ == "__main__": + + # Variables + version = "07-kmerfinder.py v 0.1.0." # Script version # FIXME + arguments = check_arg(sys.argv[1:]) + + # Create sample_id_list + path = arguments.path + sample_list = [] + tmp = os.listdir(path) + for item in tmp: + if os.path.isdir(os.path.join(path, item)): + if item != "logs": + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + else: + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + + print("sample_list done") + + # Create a dictionary + kmer_all = {} + + for sample in sample_list: + file_name = os.path.join(path, sample + "_results.txt" ) + kmer_all[sample] = kmerfinder_dictionary(file_name) + + print("kmerfinder_dictionary done") + # print (kmer_all) + + # Save the dicctionary to binary file + + dictionary2bn(kmer_all, arguments.output_bn) + + print("kmerfinder_dictionary_bn done") + + # Convert the dictionary to csv file + + dictionary2csv(kmer_all, arguments.output_csv) + + print("kmerfinder_dictionary_csv done") diff --git a/conf/modules.config b/conf/modules.config index f67f1fe0..38a81488 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -170,6 +170,16 @@ process { ] } + withName: 'KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf new file mode 100644 index 00000000..af188125 --- /dev/null +++ b/modules/local/kmerfinder_summary.nf @@ -0,0 +1,26 @@ +process KMERFINDER_SUMMARY { + tag "kmerfinder_summary" + label 'process_low' + + conda "bioconda::python=3.10.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.10' : + 'biocontainers/python:3.10' }" + + input: + path(reports, stageAs: 'reports/*') + + output: + path "kmerfinder.csv" , emit: summary + path "versions.yml" , emit: versions + + script: + """ + kmerfinder_summary.py --path kmerfinder_reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 36cede8f..fd0eab38 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -60,6 +60,7 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { KMERFINDER } from '../modules/local/kmerfinder' +include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' include { DFAST } from '../modules/local/dfast' // @@ -404,6 +405,11 @@ workflow BACASS { ) ch_kmerfinder_report = KMERFINDER.out.report ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{meta, report -> report }.collect() + ) + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) } /* // From 0f07718831c2c4e9aee597c4e46189405a0cbdfe Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 14 Nov 2023 13:48:33 +0100 
Subject: [PATCH 26/58] add module to find and download reference genome --- bin/download_reference.py | 153 +++++++++++++++++++++++ bin/find_common_reference.py | 104 +++++++++++++++ modules/local/find_download_reference.nf | 37 ++++++ nextflow.config | 3 + nextflow_schema.json | 25 +++- workflows/bacass.nf | 22 +++- 6 files changed, 337 insertions(+), 7 deletions(-) create mode 100755 bin/download_reference.py create mode 100755 bin/find_common_reference.py create mode 100644 modules/local/find_download_reference.nf diff --git a/bin/download_reference.py b/bin/download_reference.py new file mode 100755 index 00000000..907c547a --- /dev/null +++ b/bin/download_reference.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +EDITED: 14-11-2023 +DESCRIPTION: + Given a file with the kmerfinder results and frequencies (probably + created by find_common_reference.py), and the NCBI assembly sheet, + download the top-reference genome, gff and protein files from + the NCBI ftp. + +INPUT: + -FILE: file containing the ranking of references from kmerfinder created by the script find_common_references + -REFERENCE: file with the NCBI reference list + -OUTDIR: name of the output dir + +OUTPUT: + - *_fna.gz: file with the top-reference genome + - *_gff.gz: file with the top-reference gff + - *_protein.gz: file with the top-reference proteins + +USAGE: + python download_reference.py + -file [FILE] + -reference [REFERENCE] + -out_dir [OUTDIR] + +REQUIREMENTS: + -Python >= 3.6 + -Python wget + +DISCLAIMER: + This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. 
+================================================================ +END_OF_HEADER +================================================================ +""" + +import sys +import argparse +import os + +#import wget +import requests + + +def parse_args(args=None): + Description = ( + "download the reference files \ + (fna, faa, gff)from the reference NCBI file." + ) + Epilog = """Usage example: \ + python download_reference.py \ + -file \ + -reference \ + -out_dir """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-file", + help="File containing the ranking of references from kmerfinder." + ) + parser.add_argument( + "-reference", + help="File containing the paths to bacterial references." + ) + parser.add_argument( + "-out_dir", + help="Output directory." + ) + + return parser.parse_args(args) + + +def download_references(file, reference, out_dir): + """ + Downloads the top reference from the NCBI database + """ + + reference_ends = ["_genomic.fna.gz", "_protein.faa.gz", "_genomic.gff.gz"] + + # extract the most common reference from file + with open(file) as infile: + infile = infile.readlines() + infile = [ + item.replace("\n", "").split("\t") + for item in infile + if not item.startswith("#") + ] + top_reference = infile[0][0] + + print(top_reference) + + # create the outdir (do nothing if already there) + try: + os.mkdir(out_dir) + except FileExistsError: + pass + + # open the reference and find the reference + with open(reference) as inref: + inref = inref.readlines() + inref = [ + item.replace("\n", "").split("\t") + for item in inref + if not item.startswith("#") + ] + + url = [row[19] for row in inref if row[0] in top_reference] + + if len(url) == 0: + print("No assemblies responding to the top reference: ", top_reference, " were found") + sys.exit(1) + + + url = str(url[0]) + url_https = url.replace('ftp', 'https') + + # get url and reference file + + for r_end in reference_ends: + + out_file = out_dir + 
"/" + top_reference + r_end + file_url = url_https + "/" + top_reference + r_end + + print(out_file) + print(file_url) + + #wget.download(file_url, out_file) + response = requests.get(file_url, stream=True) + with open(out_file, 'wb') as out: + for chunk in response.iter_content(chunk_size=8192): + out.write(chunk) + + return + + +def main(args=None): + args = parse_args(args) + download_references(args.file, args.reference, args.out_dir) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/find_common_reference.py b/bin/find_common_reference.py new file mode 100755 index 00000000..e26aaf53 --- /dev/null +++ b/bin/find_common_reference.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +DESCRIPTION: + Given a directory with kmerfinder results, sum them up + in an outfile named by the user. + +INPUT: + -DIRECTORY: directory containing all kmerfinder results. + -OUTFILE: Name of the file to write the whole results in. + +OUTPUT: + -OUTFILE: file containing the kmerfinder results. + +USAGE: + python find_common_reference.py -d [DIRECTORY] -o [OUTFILE] +REQUIREMENTS: + -Python >= 3.6 + +DISCLAIMER: This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we dont guarantee its success + outside its purpose. + +================================================================ +END_OF_HEADER +================================================================ +""" +import os +import sys +import errno +import argparse + + +def parse_args(args=None): + """ + Parse the args given to argparser + """ + Description = "Fetch kmerfinder result files and get the most used reference." 
+ Epilog = """Example usage: python find_common_reference.py -d -o """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("-d", help="Input directory.") + parser.add_argument("-o", help="Output file.") + return parser.parse_args(args) + + +def group_references(kmer_result_dir, out_file): + """ + Unifies the kmerfinder results, and counts their occurrences + """ + reference_assembly = {} + + # for file in dir + for k_file in os.listdir(kmer_result_dir): + # open file + with open(os.path.join(kmer_result_dir, k_file), "r") as fh: + file_lines = fh.readlines() + + # remove heading + try: + heading = file_lines[0].split("\t") + first_line = file_lines[1].split("\t") + + # where is the assembly in the header? + # find reference according to index + index_assembly = heading.index("# Assembly") + reference = first_line[index_assembly] + + # add it to the dict if not there + if reference not in reference_assembly: + index_description = heading.index("Description") + reference_assembly[reference] = [0, first_line[index_description]] + # sum 1 for another occurrence + reference_assembly[reference][0] += 1 + except IndexError: + pass + + # sort it (more occurrences first in file) + order_reference = dict( + sorted(reference_assembly.items(), key=lambda x: x[1][0], reverse=True) + ) + + # write it + with open(out_file, "w") as f_out: + for key, value in order_reference.items(): + f_out.write(key + "\t" + str(value[0]) + "\t" + value[1] + "\n") + return + + +def main(args=None): + args = parse_args(args) + group_references(args.d, args.o) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf new file mode 100644 index 00000000..e847e4f2 --- /dev/null +++ b/modules/local/find_download_reference.nf @@ -0,0 +1,37 @@ +process FIND_DOWNLOAD_REFERENCE { + tag "${task.process}" + label 'process_low' + + conda "conda-forge::requests=2.26.0" + 
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0' : + 'biocontainers/requests:2.26.0' }" + + input: + path(reports, stageAs: 'reports/*') + path(ncbi_reference) + + output: + path "references_found.tsv" , emit: target_references_tsv + path "*.fna.gz" , emit: fna + path "*.gff.gz" , emit: gff + path "*.faa.gz" , emit: faa + path "versions.yml" , emit: versions + + script: + """ + find_common_reference.py \\ + -d reports/ \\ + -o references_found.tsv + + download_reference.py \\ + -file references_found.tsv \\ + -reference $ncbi_reference \\ + -out_dir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index bf69022e..94d1fdbb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,6 +19,9 @@ params { // Contamination_screening kraken2db = "" kmerfinderdb = "" + reference_fasta = "" + reference_gff = "" + reference_ncbi_bacteria = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] diff --git a/nextflow_schema.json b/nextflow_schema.json index 2eb706e0..0f0e9185 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,6 +73,23 @@ "fa_icon": "fab fa-gitkraken", "help_text": "See [Kraken2 homepage](https://benlangmead.github.io/aws-indexes/k2) for download\nlinks. Minikraken2 8GB is a reasonable choice, since we run Kraken here mainly just to check for\nsample purity.", "description": "Path to Kraken2 database." + }, + "kmerfinderdb": { + "type": "string", + "description": "Database for Kmerfinder.", + "help_text": "" + }, + "reference_fasta": { + "type": "string", + "description": "Reference FASTA file." + }, + "reference_gff": { + "type": "string", + "description": "Reference GFF file." 
+ }, + "reference_ncbi_bacteria": { + "type": "string", + "description": "NCBI Bacteria reference database" } } }, @@ -188,6 +205,11 @@ "fa_icon": "fas fa-forward", "description": "Skip running Kraken2 classifier on reads." }, + "skip_kmerfinder": { + "type": "boolean", + "description": "Skip contamination analysis with Kmerfinder", + "fa_icon": "fas fa-forward" + }, "skip_annotation": { "type": "boolean", "fa_icon": "fas fa-forward", @@ -205,7 +227,8 @@ }, "skip_multiqc": { "type": "boolean", - "description": "Skip MultiQC" + "description": "Skip MultiQC", + "fa_icon": "fas fa-forward" } } }, diff --git a/workflows/bacass.nf b/workflows/bacass.nf index fd0eab38..4eceda02 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -61,6 +61,7 @@ include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { KMERFINDER } from '../modules/local/kmerfinder' include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' +include { FIND_DOWNLOAD_REFERENCE } from '../modules/local/find_download_reference' include { DFAST } from '../modules/local/dfast' // @@ -393,23 +394,32 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // - - // TODO: add check contamination module // CALLIT PARSE_KMERFINDER - // TODO: if not provided, download reference from kmerfinder results --> module FIND_DOWNLOAD_COMMON_REFFERENCE // TODO: Create kmerfinder mode for short and longreads // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it + // TODO: create a strategy to group the samples according to the reference found. 
+ // TODO: I think that this kmerfinder step could be grouped into a subworkflow if ( !params.skip_kmerfinder && params.kmerfinderdb ) { KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] params.kmerfinderdb // path(kmerfinder database) ) - ch_kmerfinder_report = KMERFINDER.out.report - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + KMERFINDER.out.report + .map { meta, report -> report } + .collect() + .set { ch_kmerfinder_reports } + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER_SUMMARY ( - ch_kmerfinder_report.map{meta, report -> report }.collect() + ch_kmerfinder_reports ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_kmerfinder_reports, + params.reference_ncbi_bacteria + ) + } } /* // From e072b2d7c9bec98c2d4767775400521ee8c0753a Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 14 Nov 2023 17:42:27 +0100 Subject: [PATCH 27/58] fix kmerfinder summary input --- modules/local/kmerfinder_summary.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index af188125..b60fe4ca 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -16,7 +16,7 @@ process KMERFINDER_SUMMARY { script: """ - kmerfinder_summary.py --path kmerfinder_reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From 8baded16f75e8dea98504587ccb98993919b93f9 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 12:24:35 +0100 Subject: [PATCH 28/58] update kmerfinder output file extension --- conf/modules.config | 2 +- modules/local/kmerfinder.nf | 4 +++- 2 files changed, 4 insertions(+), 2 
deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 38a81488..8320045b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -165,7 +165,7 @@ process { publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*_results.txt", + pattern: "*.txt", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index db7fddf1..f7dd707e 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -12,7 +12,8 @@ process KMERFINDER { path(kmerfinderDB) output: - tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json path "versions.yml" , emit: versions script: @@ -28,6 +29,7 @@ process KMERFINDER { -x mv results.txt ${prefix}_results.txt + mv data.json ${prefix}_data.json cat <<-END_VERSIONS > versions.yml "${task.process}": From 9bdc67e2641d19e765e07874e0151c06cbd5e7ef Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 12:25:07 +0100 Subject: [PATCH 29/58] add kmerfinder refseqid to meta --- workflows/bacass.nf | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 4eceda02..a5f2fe43 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -92,6 +92,7 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/krake include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' +include { GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { MULTIQC } from 
'../modules/nf-core/multiqc/main' @@ -394,20 +395,31 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // - // TODO: Create kmerfinder mode for short and longreads - // TODO: When no kmerfinder database is found, allow nf-core/bacass to download it - // TODO: create a strategy to group the samples according to the reference found. + // TODO: Create kmerfinder mode for longreads + // TODO: create a strategy to group the samples according to the reference found. [pending, fix splitjson path-key] // TODO: I think that this kmerfinder step could be grouped into a subworkflow - if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + // TODO: Create a by refseq-id quast report && general. + // TODO: hack multiqc to group quast-entries by refseqid? KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] params.kmerfinderdb // path(kmerfinder database) ) + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + KMERFINDER.out.json + .join(ch_for_assembly, by:0) + .map{ + meta, json, sr, lr -> + meta.refseq = json + .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + return tuple(meta, sr, lr) + } + .set { ch_refseqid } + KMERFINDER.out.report .map { meta, report -> report } .collect() .set { ch_kmerfinder_reports } - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER_SUMMARY ( ch_kmerfinder_reports From 97c28660d5ada16e1ce27d75c8b1706daf039805 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 14:12:54 +0100 Subject: [PATCH 30/58] fix url in kmerfinder donwload ref --- bin/download_reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/download_reference.py b/bin/download_reference.py index 907c547a..8fa18da4 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -123,7 +123,7 @@ def download_references(file, reference, out_dir): url = str(url[0]) - url_https = url.replace('ftp', 'https') + 
url_https = url.replace('ftp', 'https', 1) # get url and reference file From 0724141bcbb1cfa054b1c80ccc3b07404dfbd1af Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 17 Nov 2023 16:44:36 +0100 Subject: [PATCH 31/58] temporary commit --- modules/local/kmerfinder.nf | 4 +- modules/local/kmerfinder_summary.nf | 2 +- workflows/bacass.nf | 58 ++++++++++++++++++----------- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index f7dd707e..58a6de83 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -12,8 +12,8 @@ process KMERFINDER { path(kmerfinderDB) output: - tuple val(meta), path("*_results.txt") , emit: report - tuple val(meta), path("*_data.json") , emit: json + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json path "versions.yml" , emit: versions script: diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index b60fe4ca..bb8f11d4 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -8,7 +8,7 @@ process KMERFINDER_SUMMARY { 'biocontainers/python:3.10' }" input: - path(reports, stageAs: 'reports/*') + val(meta), path(report, stageAs: 'reports/*') output: path "kmerfinder.csv" , emit: summary diff --git a/workflows/bacass.nf b/workflows/bacass.nf index a5f2fe43..5020fe5c 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -92,7 +92,7 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/krake include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' -include { GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { 
CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -223,7 +223,7 @@ workflow BACASS { .dump(tag: 'ch_for_assembly') .set { ch_for_assembly } } -/* + // // ASSEMBLY: Unicycler, Canu, Miniasm, Dragonflye // @@ -356,7 +356,7 @@ workflow BACASS { MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) ch_versions = ch_versions.mix(MEDAKA.out.versions.ifEmpty(null)) } -*/ + // // MODULE: Kraken2, QC for sample purity // @@ -400,38 +400,54 @@ workflow BACASS { // TODO: I think that this kmerfinder step could be grouped into a subworkflow // TODO: Create a by refseq-id quast report && general. // TODO: hack multiqc to group quast-entries by refseqid? + // TODO: corner casse >1 refseq_id + // TODO: PREPARE REFERENCES SUBWORKFLOW + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + if( params.kmerfinderdb.endsWith('.gz') ){ + GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) + ch_kmerfinderdb = GUNZIP_KMERFINDERDB.out.gunzip + } else { + ch_kmerfinderdb = params.kmerfinderdb + } + KMERFINDER ( ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] - params.kmerfinderdb // path(kmerfinder database) + ch_kmerfinderdb ) ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) KMERFINDER.out.json - .join(ch_for_assembly, by:0) + .join(KMERFINDER.out.report, by:0) + .join(ch_assembly, by:0) .map{ - meta, json, sr, lr -> - meta.refseq = json + meta, json, report, fasta -> + def new_meta = [:] + new_meta.refseq = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(meta, sr, lr) + return tuple(meta, new_meta, report, fasta) } - .set { ch_refseqid } + .groupTuple(by:1) + .set { ch_refseqid_fasta } - KMERFINDER.out.report - .map { meta, report -> report } - .collect() - .set { ch_kmerfinder_reports } + ch_refseqid_fasta.map{ meta, new_meta, report, fasta -> tuple (meta, report)}.view() + + +// KMERFINDER.out.report 
+// .map { meta, report -> report } +// .collect() +// .set { ch_kmerfinder_reports } KMERFINDER_SUMMARY ( - ch_kmerfinder_reports + ch_refseqid_fasta.map{ meta, report, fasta -> tuple (meta, report)} ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - if (!params.reference_fasta && !params.reference_gff) { - FIND_DOWNLOAD_REFERENCE ( - ch_kmerfinder_reports, - params.reference_ncbi_bacteria - ) - } +// if (!params.reference_fasta && !params.reference_gff) { +// FIND_DOWNLOAD_REFERENCE ( +// ch_kmerfinder_reports, +// params.reference_ncbi_bacteria +// ) +// } } /* // @@ -449,7 +465,7 @@ workflow BACASS { ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) - +/* // Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> From 071d3346edafd206f802451e55c69b2ba6c93f2b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Sun, 19 Nov 2023 10:54:32 +0100 Subject: [PATCH 32/58] group assemblies by refseqid --- modules/local/find_download_reference.nf | 3 ++- modules/local/kmerfinder_summary.nf | 8 +++---- workflows/bacass.nf | 28 ++++++++++-------------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index e847e4f2..60cc5c66 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -8,7 +8,7 @@ process FIND_DOWNLOAD_REFERENCE { 'biocontainers/requests:2.26.0' }" input: - path(reports, stageAs: 'reports/*') + tuple val(meta), path(reports, stageAs: 'reports/*') path(ncbi_reference) output: @@ -19,6 +19,7 @@ process FIND_DOWNLOAD_REFERENCE { path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.refseq}" """ find_common_reference.py \\ -d reports/ \\ diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index bb8f11d4..58fe104f 100644 --- 
a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -8,15 +8,15 @@ process KMERFINDER_SUMMARY { 'biocontainers/python:3.10' }" input: - val(meta), path(report, stageAs: 'reports/*') + path(report, stageAs: 'reports/*') output: - path "kmerfinder.csv" , emit: summary - path "versions.yml" , emit: versions + path "*.csv" , emit: summary + path "versions.yml" , emit: versions script: """ - kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder.csv + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 5020fe5c..ffda1899 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -424,30 +424,24 @@ workflow BACASS { def new_meta = [:] new_meta.refseq = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(meta, new_meta, report, fasta) + return tuple(new_meta, meta, report, fasta) } - .groupTuple(by:1) + .groupTuple(by:0) .set { ch_refseqid_fasta } - ch_refseqid_fasta.map{ meta, new_meta, report, fasta -> tuple (meta, report)}.view() - - -// KMERFINDER.out.report -// .map { meta, report -> report } -// .collect() -// .set { ch_kmerfinder_reports } - + ch_reports_Byrefseqid = ch_refseqid_fasta + .map{ new_meta, meta, report, fasta -> [new_meta, report] } KMERFINDER_SUMMARY ( - ch_refseqid_fasta.map{ meta, report, fasta -> tuple (meta, report)} + KMERFINDER.out.report.map{meta, report -> report }collect() ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) -// if (!params.reference_fasta && !params.reference_gff) { -// FIND_DOWNLOAD_REFERENCE ( -// ch_kmerfinder_reports, -// params.reference_ncbi_bacteria -// ) -// } + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_reports_Byrefseqid, + params.reference_ncbi_bacteria + ) + } } /* // 
From 7822c4bce75aa294fd0540f61ea76f7feeac029e Mon Sep 17 00:00:00 2001 From: Dani VM Date: Sun, 19 Nov 2023 22:23:24 +0100 Subject: [PATCH 33/58] allow global quast and by-refseqid quast --- conf/modules.config | 14 +++++++-- modules/local/find_download_reference.nf | 11 +++---- workflows/bacass.nf | 40 ++++++++++++++++-------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8320045b..0774005f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -196,12 +196,22 @@ process { ] } - withName: 'QUAST' { + withName: 'QUAST*' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: "meta.id", + saveAs: { filename -> + if (filename.equals('versions.yml')){ + null + } else if (filename.startsWith('GCF')){ + "bySampleReference/${filename}" + } + else { + "global/${filename}" + } + } ] } diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 60cc5c66..478d3b67 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -12,14 +12,13 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_reference) output: - path "references_found.tsv" , emit: target_references_tsv - path "*.fna.gz" , emit: fna - path "*.gff.gz" , emit: gff - path "*.faa.gz" , emit: faa - path "versions.yml" , emit: versions + tuple val(meta), path( "references_found.tsv") , emit: target_references_tsv + tuple val(meta), path( "*.fna.gz") , emit: fna + tuple val(meta), path( "*.gff.gz") , emit: gff + tuple val(meta), path( "*.faa.gz") , emit: faa + path "versions.yml" , emit: versions script: - def prefix = task.ext.prefix ?: "${meta.refseq}" """ find_common_reference.py \\ -d reports/ \\ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index ffda1899..3a0a2753 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ 
-91,6 +91,7 @@ include { SAMTOOLS_INDEX } from '../modules/nf-core/samto include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../modules/nf-core/kraken2/kraken2/main' include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main' include { QUAST } from '../modules/nf-core/quast/main' +include { QUAST as QUAST_BYREFSEQID } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' @@ -395,13 +396,13 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity // + // TODO: Create kmerfinder mode for longreads - // TODO: create a strategy to group the samples according to the reference found. [pending, fix splitjson path-key] // TODO: I think that this kmerfinder step could be grouped into a subworkflow - // TODO: Create a by refseq-id quast report && general. // TODO: hack multiqc to group quast-entries by refseqid? 
// TODO: corner casse >1 refseq_id // TODO: PREPARE REFERENCES SUBWORKFLOW + // TODO: PASS QUAST_BYREF TSV TO MULTIQC if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -422,7 +423,7 @@ workflow BACASS { .map{ meta, json, report, fasta -> def new_meta = [:] - new_meta.refseq = json + new_meta.id = json .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] return tuple(new_meta, meta, report, fasta) } @@ -432,7 +433,7 @@ workflow BACASS { ch_reports_Byrefseqid = ch_refseqid_fasta .map{ new_meta, meta, report, fasta -> [new_meta, report] } KMERFINDER_SUMMARY ( - KMERFINDER.out.report.map{meta, report -> report }collect() + KMERFINDER.out.report.map{meta, report -> report }.collect() ) ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) @@ -441,25 +442,39 @@ workflow BACASS { ch_reports_Byrefseqid, params.reference_ncbi_bacteria ) + ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna + ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff } } -/* + // // MODULE: QUAST, assembly QC // - ch_assembly - .collect{ it[1] } - .map { consensus_collect -> tuple([id: "report"], consensus_collect) } - .set { ch_to_quast } + ch_refseqid_fasta + .join(ch_reference_fasta) + .join(ch_reference_gff) + .groupTuple(by:0) + .set { ch_to_quast} - QUAST ( - ch_to_quast, + QUAST( + ch_assembly + .collect{ it[1]} + .map{ consensus -> tuple([id:'report'], consensus)}, [[:],[]], [[:],[]] ) + QUAST_BYREFSEQID ( + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten()) + }, + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast + .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) -/* + // 
Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> @@ -557,7 +572,6 @@ workflow BACASS { ) multiqc_report = MULTIQC.out.report.toList() } -*/ } /* From 149071f7aa7b52f6f82c1831dde61eae84479bcb Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 21 Nov 2023 10:31:17 +0100 Subject: [PATCH 34/58] move kmerfinder processing to subworkflow plus output refactoring --- conf/modules.config | 13 ++- subworkflows/local/kmerfinder_subworkflow.nf | 65 +++++++++++++++ workflows/bacass.nf | 84 +++++++------------- 3 files changed, 101 insertions(+), 61 deletions(-) create mode 100644 subworkflows/local/kmerfinder_subworkflow.nf diff --git a/conf/modules.config b/conf/modules.config index 0774005f..bcbd5750 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -160,7 +160,7 @@ process { ] } - withName: 'KMERFINDER' { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { ext.args = '' publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, @@ -170,7 +170,7 @@ process { ] } - withName: 'KMERFINDER_SUMMARY' { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { ext.args = '' publishDir = [ path: { "${params.outdir}/Kmerfinder" }, @@ -196,20 +196,19 @@ process { ] } - withName: 'QUAST*' { + withName: 'QUAST' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - pattern: "meta.id", saveAs: { filename -> - if (filename.equals('versions.yml')){ + if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null } else if (filename.startsWith('GCF')){ - "bySampleReference/${filename}" + "report_bySampleReference/${filename}" } else { - "global/${filename}" + "${filename}" } } ] diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf new file mode 100644 index 00000000..bc844c77 --- /dev/null +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -0,0 +1,65 @@ +// +// Kmerfinder subworkflow for species 
identification & QC +// +include { KMERFINDER } from '../../modules/local/kmerfinder' +include { KMERFINDER_SUMMARY } from '../../modules/local/kmerfinder_summary' +include { FIND_DOWNLOAD_REFERENCE } from '../../modules/local/find_download_reference' +include { QUAST } from '../../modules/nf-core/quast/main' + +workflow KMERFINDER_SUBWORKFLOW { + take: + kmerfinder_db // channel: [ path ] + ncbi_bacteria_db // channel: [ path ] + reads // channel: [ meta, reads ] + consensus // channel: [ meta, consensus ] + + main: + ch_versions = Channel.empty() + + // MODULE: Kmerfinder, QC for sample purity + KMERFINDER ( + reads, + kmerfinder_db + ) + ch_kmerfinder_report = KMERFINDER.out.report + ch_kmerfinder_json = KMERFINDER.out.json + ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + + // MODULE: Kmerfinder summary report + KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{meta, report -> report }.collect() + ) + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + + // SUBWORKFLOW: Group sample assemblies by reference geneome + ch_kmerfinder_json + .join(ch_kmerfinder_report, by:0) + .join(consensus, by:0) + .map{ + meta, json, report_txt, fasta -> + def refseq = [:] + refseq.id = json + .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + return tuple(refseq, meta, report_txt, fasta) + } + .groupTuple(by:0) + .set { ch_consensus_byrefseq } + + // MODULE: Find & Download common reference sequences + if (!params.reference_fasta && !params.reference_gff) { + FIND_DOWNLOAD_REFERENCE ( + ch_consensus_byrefseq.map{ refseq, meta, report_txt, fasta -> tuple(refseq, report_txt)}, + ncbi_bacteria_db + ) + ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna + ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff + ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) + } + + + emit: + versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] + reference_fasta = 
ch_reference_fasta // channel: [ meta, path(*.fna) ] + reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] + consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] +} diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 3a0a2753..d7b36b38 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -59,9 +59,6 @@ include { UNICYCLER } from '../modules/local/unicycler' include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' -include { KMERFINDER } from '../modules/local/kmerfinder' -include { KMERFINDER_SUMMARY } from '../modules/local/kmerfinder_summary' -include { FIND_DOWNLOAD_REFERENCE } from '../modules/local/find_download_reference' include { DFAST } from '../modules/local/dfast' // @@ -102,6 +99,7 @@ include { MULTIQC } from '../modules/nf-core/multi // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // include { FASTQ_TRIM_FASTP_FASTQC } from '../subworkflows/nf-core/fastq_trim_fastp_fastqc/main' +include { KMERFINDER_SUBWORKFLOW } from '../subworkflows/local/kmerfinder_subworkflow' include { BAKTA_DBDOWNLOAD_RUN } from '../subworkflows/local/bakta_dbdownload_run' /* @@ -398,7 +396,6 @@ workflow BACASS { // // TODO: Create kmerfinder mode for longreads - // TODO: I think that this kmerfinder step could be grouped into a subworkflow // TODO: hack multiqc to group quast-entries by refseqid? 
// TODO: corner casse >1 refseq_id // TODO: PREPARE REFERENCES SUBWORKFLOW @@ -411,70 +408,49 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } - KMERFINDER ( - ch_for_assembly.map{ meta, sr, lr -> tuple( meta, sr) }, // [meta, reads] - ch_kmerfinderdb + KMERFINDER_SUBWORKFLOW ( + ch_kmerfinderdb, + params.reference_ncbi_bacteria, + ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] + ch_assembly // [meta, consensus] ) - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) - - KMERFINDER.out.json - .join(KMERFINDER.out.report, by:0) - .join(ch_assembly, by:0) - .map{ - meta, json, report, fasta -> - def new_meta = [:] - new_meta.id = json - .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(new_meta, meta, report, fasta) - } - .groupTuple(by:0) - .set { ch_refseqid_fasta } - - ch_reports_Byrefseqid = ch_refseqid_fasta - .map{ new_meta, meta, report, fasta -> [new_meta, report] } - KMERFINDER_SUMMARY ( - KMERFINDER.out.report.map{meta, report -> report }.collect() - ) - ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta + ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff + ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq + ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - if (!params.reference_fasta && !params.reference_gff) { - FIND_DOWNLOAD_REFERENCE ( - ch_reports_Byrefseqid, - params.reference_ncbi_bacteria - ) - ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna - ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff - } } // // MODULE: QUAST, assembly QC // - ch_refseqid_fasta - .join(ch_reference_fasta) - .join(ch_reference_gff) - .groupTuple(by:0) - .set { ch_to_quast} - - QUAST( - ch_assembly + ch_assembly .collect{ it[1]} - .map{ consensus -> tuple([id:'report'], consensus)}, + .map{ consensus -> 
tuple([id:'report'], consensus)} + .set{ch_to_quast} + QUAST( + ch_to_quast, [[:],[]], [[:],[]] ) - QUAST_BYREFSEQID ( - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten()) - }, - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, - ch_to_quast - .map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} - ) ch_quast_multiqc = QUAST.out.tsv ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) + if (!params.skip_kmerfinder){ + // Prepare input for quast + ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] + .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .groupTuple(by:0) + .set { ch_to_quast_byrefseq}// channel: [refseq, meta, report, consensus, ref_fasta, ref_gff] + + QUAST_BYREFSEQID ( + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten())}, + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) + } + // Check assemblies that require further processing for gene annotation ch_assembly .branch{ meta, fasta -> From 6e7e0fa35133dafc5980a3a7ef1308662edb8c36 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 21 Nov 2023 15:50:05 +0100 Subject: [PATCH 35/58] allow quast to standard and byrefseq data --- subworkflows/local/kmerfinder_subworkflow.nf | 6 ++ workflows/bacass.nf | 58 ++++++++++++-------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index bc844c77..99b9b4c3 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ 
b/subworkflows/local/kmerfinder_subworkflow.nf @@ -56,9 +56,15 @@ workflow KMERFINDER_SUBWORKFLOW { ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) } + // Get reference sequence IDs + ch_consensus_byrefseq + .map{ refseq, meta, report_txt, fasta -> refseq } + .collect() + .set { ch_refseqid } emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] + refseqids = ch_refseqid reference_fasta = ch_reference_fasta // channel: [ meta, path(*.fna) ] reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index d7b36b38..9c1856b5 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -397,8 +397,6 @@ workflow BACASS { // TODO: Create kmerfinder mode for longreads // TODO: hack multiqc to group quast-entries by refseqid? - // TODO: corner casse >1 refseq_id - // TODO: PREPARE REFERENCES SUBWORKFLOW // TODO: PASS QUAST_BYREF TSV TO MULTIQC if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ @@ -414,13 +412,30 @@ workflow BACASS { ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] ch_assembly // [meta, consensus] ) + ch_refseqid = KMERFINDER_SUBWORKFLOW.out.refseqids ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) + // Processing output: + ch_consensus_byrefseq + .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .groupTuple(by:0) + .map { + refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> + ch_refseqid.size() + if 
(ch_refseqid.size().getVal() > 1 ){ + return [refseq, consensus.flatten(), ref_fasta, ref_gff] + } else { + return [[id:'report'], consensus.flatten(), ref_fasta, ref_gff] + } + } + .set { ch_to_quast_byrefseq } } + // // MODULE: QUAST, assembly QC // @@ -428,28 +443,27 @@ workflow BACASS { .collect{ it[1]} .map{ consensus -> tuple([id:'report'], consensus)} .set{ch_to_quast} - QUAST( - ch_to_quast, - [[:],[]], - [[:],[]] - ) - ch_quast_multiqc = QUAST.out.tsv - ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) - - if (!params.skip_kmerfinder){ - // Prepare input for quast - ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] - .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] - .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] - .groupTuple(by:0) - .set { ch_to_quast_byrefseq}// channel: [refseq, meta, report, consensus, ref_fasta, ref_gff] - QUAST_BYREFSEQID ( - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus.flatten())}, - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, - ch_to_quast_byrefseq.map{ refseqid, meta, report, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} - ) + if(params.skip_kmerfinder){ + QUAST( + ch_to_quast, + params.reference_fasta ?: [[:],[]], + params.reference_gff ?: [[:],[]] + ) + } else if (ch_to_quast_byrefseq){ + QUAST( + ch_to_quast, + [[:],[]], + [[:],[]] + ) + QUAST_BYREFSEQID( + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus)}, + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, + ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ) } + ch_quast_multiqc = QUAST.out.tsv + ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that 
require further processing for gene annotation ch_assembly From a184e4156b86e870a696bde50ef2343c0772e7f4 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 22 Nov 2023 13:55:32 +0100 Subject: [PATCH 36/58] add byrefseq quast reports to multiqc and patch quast --- assets/multiqc_config.yml | 118 +++++++++++++++++++- bin/multiqc_to_custom_csv.py | 184 +++++++++++++++++++++++++++++++ conf/modules.config | 4 +- modules.json | 3 +- modules/local/multiqc_custom.nf | 54 +++++++++ modules/nf-core/quast/main.nf | 4 +- modules/nf-core/quast/quast.diff | 23 ++++ workflows/bacass.nf | 63 ++++++----- 8 files changed, 421 insertions(+), 32 deletions(-) create mode 100755 bin/multiqc_to_custom_csv.py create mode 100644 modules/local/multiqc_custom.nf create mode 100644 modules/nf-core/quast/quast.diff diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 2f09d568..49e9b244 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,12 +2,126 @@ report_comment: > This report has been generated by the nf-core/bacass analysis pipeline. For information about how to interpret these results, please see the documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanoplot + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." 
+ path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast (Unicycler)" + anchor: "quast_unicycler" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast_unicycler/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + + report_section_order: - "nf-core-bacass-methods-description": + fastqc: + after: general_stats + fastp: + after: fastqc + nanoplot: + after: general_stats + porechop: + after: nanoplot + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: order: -1000 software_versions: order: -1001 - "nf-core-bacass-summary": + nf-core-bacass-summary: order: -1002 +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Contigs (Unicycler)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (Unicycler)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (Unicycler)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Reference Genome (Kmerfinder)": + description: "Reference genome calculated by Blast" + format: "{:,.0f}" + export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py new file mode 100755 index 00000000..3f536b1c --- /dev/null +++ b/bin/multiqc_to_custom_csv.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +# Sourced and Edited from nf-core/viralrecon: +# 
https://github.com/nf-core/viralrecon/blob/master/bin/multiqc_to_custom_csv.py#L59 +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = ( + "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline." + ) + Epilog = "Example usage: python multiqc_to_custom_tsv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. (default: 'multiqc_data').", + ) + parser.add_argument( + "-op", + "--out_prefix", + type=str, + dest="OUT_PREFIX", + default="summary", + help="Full path to output prefix (default: 'summary').", + ) + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k, v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[]): + integer_fields = [ + "mapped_passed", + "number_of_SNPs", + "number_of_indels", + "MISSENSE", + "# contigs (>= 0 bp)", + "# contigs (>= 5000 bp)", + "Largest contig", + ] + if os.path.exists(yaml_file): + with open(yaml_file) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + include_sample = True + if len(valid_sample_list) != 0 and key not in valid_sample_list: + include_sample = False + if include_sample: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + val = list(find_tag(yaml_dict[k], 
j[0])) + ## Fix for Cutadapt reporting reads/pairs as separate values + if j[0] == "r_written" and len(val) == 0: + val = [list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2] + if len(val) != 0: + val = val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if j[0] in integer_fields: + val = int(val) + if i not in append_dict[key]: + append_dict[key][i] = val + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = yaml_dict[k] + else: + print("WARNING: File does not exist: {}".format(yaml_file)) + if len(valid_sample_list) != 0: + for key in valid_sample_list: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + if i not in append_dict[key]: + append_dict[key][i] = "NA" + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = "NA" + return append_dict + + +def metrics_dict_to_file(file_field_list, multiqc_data_dir, out_file, valid_sample_list=[]): + metrics_dict = {} + field_list = [] + for yaml_file, mapping_list in file_field_list: + yaml_file = os.path.join(multiqc_data_dir, yaml_file) + metrics_dict = yaml_fields_to_dict( + yaml_file=yaml_file, + append_dict=metrics_dict, + field_mapping_list=mapping_list, + valid_sample_list=valid_sample_list, + ) + field_list += [x[0] for x in mapping_list] + + if metrics_dict != {}: + make_dir(os.path.dirname(out_file)) + fout = open(out_file, "w") + header = ["Sample"] + field_list + fout.write("{}\n".format(",".join(header))) + for k in sorted(metrics_dict.keys()): + row_list = [k] + for field in field_list: + if field in metrics_dict[k]: + if metrics_dict[k][field]: + row_list.append(str(metrics_dict[k][field]).replace(",", ";")) + else: + row_list.append("NA") + else: + row_list.append("NA") + 
fout.write("{}\n".format(",".join(row_list))) + fout.close() + return metrics_dict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + illumina_assembly_files = [ + ( + "multiqc_quast_quast_unicycler.yaml", + [ + ("# Contigs (Unicycler)", ["# contigs (>= 0 bp)"]), + ("# Largest contig (Unicycler)", ["Largest contig"]), + ("# N50 (Unicycler)", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_quast_extra.yaml", + [ + ("# Reference Genome (Kmerfinder)", ["RefGenome"]), + ] + ), + ] + + ## Write de novo assembly metrics to file + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index bcbd5750..7a7296c2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -205,7 +205,7 @@ process { if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null } else if (filename.startsWith('GCF')){ - "report_bySampleReference/${filename}" + "runs_per_reference/${filename}" } else { "${filename}" @@ -234,7 +234,7 @@ process { } withName: 'MULTIQC' { - ext.args = '' + ext.args = '-k yaml' publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, diff --git a/modules.json b/modules.json index f173c1de..5f13ad69 100644 --- a/modules.json +++ b/modules.json @@ -85,7 +85,8 @@ "quast": { "branch": "master", "git_sha": "344638191a5d6b3526556410819dfcf24e98039e", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/quast/quast.diff" }, "racon": { "branch": "master", diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf new file mode 100644 index 00000000..59b0ceb7 --- /dev/null +++ b/modules/local/multiqc_custom.nf @@ -0,0 +1,54 @@ 
+process MULTIQC { + label 'process_medium' + + conda "bioconda::multiqc=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + + input: + path 'multiqc_config.yaml' + path multiqc_custom_config + path software_versions + //path workflow_summary + path multiqc_logo + path ('fastqc/*') + path ('fastp/*') + path ('nanoplot/*') + path ('porechop/*') + path ('pycoqc/*') + path ('kraken2_short/*') + path ('kraken2_long/*') + path ('quast_unicycler/*') + path ('prokka/*') + path ('bakta/*') + path ('extra/*') + + output: + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def custom_config = multiqc_custom_config ? "--config $multiqc_custom_config" : '' + """ + ## Run MultiQC once to parse tool logs + multiqc -f $args $custom_config . + + ## Collect extra fields to be included in the report + cp extra/* multiqc_data/ + + ## Parse YAML files dumped by MultiQC to obtain metrics + multiqc_to_custom_csv.py + + ## Run multiqc a second time + multiqc -f $args -e general_stats $custom_config . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf index e265df73..29fb78f5 100644 --- a/modules/nf-core/quast/main.nf +++ b/modules/nf-core/quast/main.nf @@ -14,7 +14,7 @@ process QUAST { output: tuple val(meta), path("${prefix}") , emit: results - tuple val(meta), path("${prefix}.tsv") , emit: tsv + tuple val(meta), path("report.tsv") , emit: tsv tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned @@ -37,7 +37,7 @@ process QUAST { $args \\ ${consensus.join(' ')} - ln -s ${prefix}/report.tsv ${prefix}.tsv + ln -s ${prefix}/report.tsv report.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff new file mode 100644 index 00000000..d267a2c9 --- /dev/null +++ b/modules/nf-core/quast/quast.diff @@ -0,0 +1,23 @@ +Changes in module 'nf-core/quast' +--- modules/nf-core/quast/main.nf ++++ modules/nf-core/quast/main.nf +@@ -14,7 +14,7 @@ + + output: + tuple val(meta), path("${prefix}") , emit: results +- tuple val(meta), path("${prefix}.tsv") , emit: tsv ++ tuple val(meta), path("report.tsv") , emit: tsv + tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome + tuple val(meta), 
path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies + tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned +@@ -37,7 +37,7 @@ + $args \\ + ${consensus.join(' ')} + +- ln -s ${prefix}/report.tsv ${prefix}.tsv ++ ln -s ${prefix}/report.tsv report.tsv + [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv + [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv + [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv + +************************************************************ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 9c1856b5..7a6940f9 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -40,8 +40,8 @@ if(! params.skip_kraken2){ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) @@ -60,6 +60,8 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { DFAST } from '../modules/local/dfast' +include { CUSTOM_MQC_TABLES } from '../modules/local/custom_mqc_tables' +include { MULTIQC } from '../modules/local/multiqc_custom' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -93,7 +95,6 @@ include { GUNZIP } from '../modules/nf-core/gunzi include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' // // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules @@ -396,8 +397,7 @@ workflow BACASS { // // TODO: Create kmerfinder mode for longreads - // TODO: hack multiqc to group quast-entries by refseqid? 
- // TODO: PASS QUAST_BYREF TSV TO MULTIQC + // TODO: add new column to multiqc with refseq name if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -418,7 +418,7 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Processing output: + // Processing output: group data according to their ref-genome and rename meta according to the number of identified references ch_consensus_byrefseq .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] @@ -439,6 +439,7 @@ workflow BACASS { // // MODULE: QUAST, assembly QC // + // FIXME: simplify it. I think choolsing anotherapproach will improve it ch_assembly .collect{ it[1]} .map{ consensus -> tuple([id:'report'], consensus)} @@ -450,6 +451,7 @@ workflow BACASS { params.reference_fasta ?: [[:],[]], params.reference_gff ?: [[:],[]] ) + ch_quast_multiqc = QUAST.out.results } else if (ch_to_quast_byrefseq){ QUAST( ch_to_quast, @@ -462,7 +464,7 @@ workflow BACASS { ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} ) } - ch_quast_multiqc = QUAST.out.tsv + ch_quast_multiqc = QUAST_BYREFSEQID.out.results ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that require further processing for gene annotation @@ -538,27 +540,38 @@ workflow BACASS { methods_description = WorkflowBacass.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = 
ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_trim_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_trim_json_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_quast_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([])) + // TODO: Clean this. find a better place. + ch_to_quast_byrefseq + .map{ + refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus) + } + .transpose() + .map { + [it[1].getSimpleName(), it[0]['id']] + } + .collectFile(name: 'multiqc_quast_extra.yaml') { + sample_name, refseqid -> + "$sample_name:\n RefGenome: $refseqid\n" + } + .set { ch_extra_multiqc } MULTIQC ( - ch_multiqc_files.collect(), ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - ch_multiqc_logo.collect().ifEmpty([]) + ch_multiqc_custom_config, + CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), + //ch_workflow_summary, // FIXME: Cannot parse this file... 
+ ch_multiqc_logo.collect().ifEmpty([]), + ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), + ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), + ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([]), + ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision + ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_extra_multiqc.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() } From 997fc917b2dc686785ac2c0e87362bc25f4917c1 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 11:17:16 +0100 Subject: [PATCH 37/58] update multiqc and append fastp metrics to assmebly metrics df --- assets/multiqc_config.yml | 10 +++++++++- bin/multiqc_to_custom_csv.py | 7 +++++++ modules/local/multiqc_custom.nf | 6 +++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 49e9b244..9f9f7650 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -28,6 +28,8 @@ module_order: - fastp: name: "PREPROCESS: fastp (adapter trimming)" info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" - nanostat: name: "PREPROCESS: Nanoplot" info: "This section of the report shows Nanoplot results for nanopore sequencing data." 
@@ -70,7 +72,7 @@ report_section_order: fastqc: after: general_stats fastp: - after: fastqc + after: general_stats nanoplot: after: general_stats porechop: @@ -99,6 +101,12 @@ custom_data: "Sample": description: "Input sample names" format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" "# Contigs (Unicycler)": description: "Total number of contigs calculated by QUAST" format: "{:,.0f}" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 3f536b1c..8d9549c4 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -155,6 +155,13 @@ def main(args=None): ## File names for MultiQC YAML along with fields to fetch from each file illumina_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input reads", ["before_filtering", "total_reads"]), + ("# Trimmed reads (fastp)", ["after_filtering", "total_reads"]), + ] + ), ( "multiqc_quast_quast_unicycler.yaml", [ diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 59b0ceb7..488bc378 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.17" + conda "bioconda::multiqc=1.19" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path 'multiqc_config.yaml' From 07f6c397be3d55e873dae8280a6edd85f4389d15 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 12:51:40 +0100 Subject: [PATCH 38/58] add new method to complie kmerfinder results into multiqc report --- assets/multiqc_config.yml | 25 ++++++++- bin/csv_to_yaml.py | 58 ++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 11 +++- modules/local/find_download_reference.nf | 1 - modules/local/kmerfinder_summary.nf | 11 +++- modules/local/multiqc_custom.nf | 2 +- subworkflows/local/kmerfinder_subworkflow.nf | 13 +++-- workflows/bacass.nf | 28 ++-------- 8 files changed, 110 insertions(+), 39 deletions(-) create mode 100755 bin/csv_to_yaml.py diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 9f9f7650..82c2f16e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -119,8 +119,29 @@ custom_data: "# % Genome fraction": description: "% genome fraction calculated by QUAST" format: "{:,.2f}" - "# Reference Genome (Kmerfinder)": - description: "Reference genome calculated by Blast" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit 
from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" format: "{:,.0f}" export_plots: true diff --git a/bin/csv_to_yaml.py b/bin/csv_to_yaml.py new file mode 100755 index 00000000..6f3fc9cf --- /dev/null +++ b/bin/csv_to_yaml.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +import sys +import argparse +import csv +import yaml + +def parse_args(args=None): + Description = ( + "Create a yaml file from csv input file grouping samples as keys and resting fields as their value pair." + ) + + Epilog = "Example usage: python csv_to_yaml.py -i myfile.csv -k 'sample_name' -o converted_file" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-i", + "--input", + type=str, + dest="CSV_FILE", + help="Input file in CSV format." + ) + + parser.add_argument( + "-k", + "--key_field", + type=str, + dest="KEY_FIELD", + help="Name of the key/column grupping field in the input csv." 
+ ) + + parser.add_argument( + "-op", + "--output_prefix", + type=str, + default="output_file", + dest="OUT_PREFIX", + help="Output file name" + ) + return parser.parse_args(args) + +def parse_csv(csv_file): + with open(csv_file, 'r') as c: + csv_reader = csv.DictReader(c) + data = [ row for row in csv_reader] + return data + +def create_yaml(data, key, output_prefix): + yaml_data = {entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data} + with open( output_prefix + '.yaml' , 'w') as yaml_file: + yaml.dump(yaml_data, yaml_file, default_flow_style=False) + +def main(args=None): + args = parse_args(args) + file_list = parse_csv(args.CSV_FILE) + + create_yaml(data=file_list, key=args.KEY_FIELD, output_prefix=args.OUT_PREFIX) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 8d9549c4..444b2860 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -172,9 +172,16 @@ def main(args=None): ], ), ( - "multiqc_quast_extra.yaml", + "multiqc_kmerfinder.yaml", [ - ("# Reference Genome (Kmerfinder)", ["RefGenome"]), + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage:"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), ] ), ] diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 478d3b67..36b59f0c 100644 --- a/modules/local/find_download_reference.nf +++ 
b/modules/local/find_download_reference.nf @@ -12,7 +12,6 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_reference) output: - tuple val(meta), path( "references_found.tsv") , emit: target_references_tsv tuple val(meta), path( "*.fna.gz") , emit: fna tuple val(meta), path( "*.gff.gz") , emit: gff tuple val(meta), path( "*.faa.gz") , emit: faa diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf index 58fe104f..8e5fe45f 100644 --- a/modules/local/kmerfinder_summary.nf +++ b/modules/local/kmerfinder_summary.nf @@ -2,22 +2,27 @@ process KMERFINDER_SUMMARY { tag "kmerfinder_summary" label 'process_low' - conda "bioconda::python=3.10.0" + conda "bioconda::multiqc=1.19" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.10' : - 'biocontainers/python:3.10' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path(report, stageAs: 'reports/*') output: path "*.csv" , emit: summary + path "*.yaml" , emit: yaml path "versions.yml" , emit: versions script: """ + ## summarizing kmerfinder results kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv + ## Create a yaml file from csv + csv_to_yaml.py -i kmerfinder_summary.csv -k 'sample_name' -op kmerfinder_summary + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | awk '{print \$2}') diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 488bc378..30ab46b7 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -37,7 +37,7 @@ process MULTIQC { ## Run MultiQC once to parse tool logs multiqc -f $args $custom_config . 
- ## Collect extra fields to be included in the report + ## Collect additional files to be included in the report cp extra/* multiqc_data/ ## Parse YAML files dumped by MultiQC to obtain metrics diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index 99b9b4c3..ffe3f74e 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -29,17 +29,17 @@ workflow KMERFINDER_SUBWORKFLOW { KMERFINDER_SUMMARY ( ch_kmerfinder_report.map{meta, report -> report }.collect() ) - ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml + ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - // SUBWORKFLOW: Group sample assemblies by reference geneome + // SUBWORKFLOW: Group assemblies by reference geneome ch_kmerfinder_json .join(ch_kmerfinder_report, by:0) .join(consensus, by:0) .map{ - meta, json, report_txt, fasta -> + meta, report_json, report_txt, fasta -> def refseq = [:] - refseq.id = json - .splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] + refseq.id = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] return tuple(refseq, meta, report_txt, fasta) } .groupTuple(by:0) @@ -64,7 +64,8 @@ workflow KMERFINDER_SUBWORKFLOW { emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] - refseqids = ch_refseqid + summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yml) ] + refseqids = ch_refseqid // channel: [ val(refseq1), val(refseq1),...] 
reference_fasta = ch_reference_fasta // channel: [ meta, path(*.fna) ] reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 7a6940f9..98059137 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -60,7 +60,6 @@ include { NANOPOLISH } from '../modules/local/nanopolish' include { MEDAKA } from '../modules/local/medaka' include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' include { DFAST } from '../modules/local/dfast' -include { CUSTOM_MQC_TABLES } from '../modules/local/custom_mqc_tables' include { MULTIQC } from '../modules/local/multiqc_custom' // @@ -394,10 +393,8 @@ workflow BACASS { // // MODULE: Kmerfinder, QC for sample purity - // - // TODO: Create kmerfinder mode for longreads - // TODO: add new column to multiqc with refseq name + if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) @@ -419,13 +416,12 @@ workflow BACASS { ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) // Processing output: group data according to their ref-genome and rename meta according to the number of identified references - ch_consensus_byrefseq + ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] .groupTuple(by:0) .map { - refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> - ch_refseqid.size() + refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> ch_refseqid.size() if (ch_refseqid.size().getVal() > 1 ){ return [refseq, consensus.flatten(), ref_fasta, ref_gff] } else { @@ -435,7 +431,6 @@ workflow BACASS { .set { ch_to_quast_byrefseq } } - // // MODULE: QUAST, assembly QC 
// @@ -540,21 +535,6 @@ workflow BACASS { methods_description = WorkflowBacass.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) - // TODO: Clean this. find a better place. - ch_to_quast_byrefseq - .map{ - refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus) - } - .transpose() - .map { - [it[1].getSimpleName(), it[0]['id']] - } - .collectFile(name: 'multiqc_quast_extra.yaml') { - sample_name, refseqid -> - "$sample_name:\n RefGenome: $refseqid\n" - } - .set { ch_extra_multiqc } - MULTIQC ( ch_multiqc_config, ch_multiqc_custom_config, @@ -571,7 +551,7 @@ workflow BACASS { ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), - ch_extra_multiqc.collect().ifEmpty([]) + KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), ) multiqc_report = MULTIQC.out.report.toList() } From e0481e02b05f0bcf54beed8a1cffb17563fcc382 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Tue, 2 Jan 2024 16:58:33 +0100 Subject: [PATCH 39/58] add long reads assembly metrics to custom multiqc --- assets/multiqc_config.yml | 147 +------------------------------ assets/multiqc_config_long.yml | 139 +++++++++++++++++++++++++++++ assets/multiqc_config_short.yml | 133 ++++++++++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 71 ++++++++++++--- modules.json | 3 +- modules/local/kmerfinder.nf | 4 +- modules/local/multiqc_custom.nf | 4 +- modules/nf-core/racon/main.nf | 6 +- modules/nf-core/racon/racon.diff | 26 ++++++ workflows/bacass.nf | 25 ++++-- 10 files changed, 386 insertions(+), 172 deletions(-) create mode 100644 assets/multiqc_config_long.yml create mode 100644 assets/multiqc_config_short.yml create mode 100644 modules/nf-core/racon/racon.diff diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 82c2f16e..2f09d568 
100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,155 +2,12 @@ report_comment: > This report has been generated by the nf-core/bacass analysis pipeline. For information about how to interpret these results, please see the documentation. - -data_format: "yaml" - -max_table_rows: 10000 - -run_modules: - - custom_content - - fastqc - - fastp - - nanoplot - - porechop - - pycoqc - - kraken2 - - quast - - prokka - - bakta - -module_order: - - fastqc: - name: "PREPROCESS: FastQC (raw reads)" - info: "This section of the report shows FastQC results for the raw reads before adapter trimming." - path_filters: - - "./fastqc/*.zip" - - fastp: - name: "PREPROCESS: fastp (adapter trimming)" - info: "This section of the report shows fastp results for reads after adapter and quality trimming." - path_filters: - - "./fastp/*.json" - - nanostat: - name: "PREPROCESS: Nanoplot" - info: "This section of the report shows Nanoplot results for nanopore sequencing data." - path_filters: - - "./nanoplot/*.txt" - - porechop: - name: "PREPROCESS: Porechop" - info: "This section of the report shows Porechop results for reads after adapter trimming." - path_filters: - - "./porechop/*.log" - - pycoqc: - name: "PREPROCESS: PycoQC" - info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." - path_filters: - - "./pycoqc/*.txt" - - kraken2: - name: "CONTAMINATION ANALYSIS: Kraken 2" - info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." - path_filters: - - ".*kraken2_*/*report.txt" - - quast: - name: "ASSEMBLY: Quast (Unicycler)" - anchor: "quast_unicycler" - info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." 
- path_filters: - - "./quast_unicycler/*/report.tsv" - - prokka: - name: "ANNOTATION: Prokka" - info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." - path_filters: - - "./prokka/*.txt" - - bakta: - name: "ANNOTATION: Bakta" - info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." - path_filters: - - "./bakta/*.txt" - - report_section_order: - fastqc: - after: general_stats - fastp: - after: general_stats - nanoplot: - after: general_stats - porechop: - after: nanoplot - kraken2: - after: general_stats - quast: - after: general_stats - prokka: - before: nf-core-bacass-methods-description - bakta: - before: nf-core-bacass-methods-description - nf-core-bacass-methods-description: + "nf-core-bacass-methods-description": order: -1000 software_versions: order: -1001 - nf-core-bacass-summary: + "nf-core-bacass-summary": order: -1002 -custom_data: - summary_assembly_metrics: - section_name: "De novo assembly metrics" - description: "generated by nf-core/bacass" - plot_type: "table" - headers: - "Sample": - description: "Input sample names" - format: "{:,.0f}" - "# Input reads": - description: "Total number of input reads in raw fastq files" - format: "{:,.0f}" - "# Trimmed reads (fastp)": - description: "Total number of reads remaining after adapter/quality trimming with fastp" - format: "{:,.0f}" - "# Contigs (Unicycler)": - description: "Total number of contigs calculated by QUAST" - format: "{:,.0f}" - "# Largest contig (Unicycler)": - description: "Size of largest contig calculated by QUAST" - format: "{:,.0f}" - "# N50 (Unicycler)": - description: "N50 metric for de novo assembly as calculated by QUAST" - format: "{:,.0f}" - "# % Genome fraction": - description: "% genome fraction calculated by QUAST" - format: "{:,.2f}" - "# Best hit (Kmerfinder)": - description: "Specie name of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit 
assembly ID (Kmerfinder)": - description: "Assembly ID of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit query coverage (Kmerfinder)": - description: "Query coverage value of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Best hit depth (Kmerfinder)": - description: "Depth of the best hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit (Kmerfinder)": - description: "Specie name of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit assembly ID (Kmerfinder)": - description: "Assembly ID of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit query coverage (Kmerfinder)": - description: "Query coverage value of the second hit from Kmerfinder" - format: "{:,.0f}" - "# Second hit depth (Kmerfinder)": - description: "Depth of the second hit from Kmerfinder" - format: "{:,.0f}" - export_plots: true - -# # Customise the module search patterns to speed up execution time -# # - Skip module sub-tools that we are not interested in -# # - Replace file-content searching with filename pattern searching -# # - Don't add anything that is the same as the MultiQC default -# # See https://multiqc.info/docs/#optimise-file-search-patterns for details -sp: - fastp: - fn: "*.fastp.json" diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml new file mode 100644 index 00000000..083ea39b --- /dev/null +++ b/assets/multiqc_config_long.yml @@ -0,0 +1,139 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +module_order: + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." 
+ path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + + +report_section_order: + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (long-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median read lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median read quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID 
(Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml new file mode 100644 index 00000000..ae8eaebe --- /dev/null +++ b/assets/multiqc_config_short.yml @@ -0,0 +1,133 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - kraken2 + - quast + - prokka + - bakta + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." 
+ path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (short-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder" + 
format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 444b2860..bccb0143 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -22,6 +22,14 @@ def parse_args(args=None): default="multiqc_data", help="Full path to directory containing YAML files for each module, as generated by MultiQC. 
(default: 'multiqc_data').", ) + parser.add_argument( + "-t", + "--assembly_type", + type=str, + dest="ASSEMBLY_TYPE", + default="short", + help="String defining the assembly mode for genome de novo assembly (options: short, long, hybrid).", + ) parser.add_argument( "-op", "--out_prefix", @@ -163,11 +171,11 @@ def main(args=None): ] ), ( - "multiqc_quast_quast_unicycler.yaml", + "multiqc_quast.yaml", [ - ("# Contigs (Unicycler)", ["# contigs (>= 0 bp)"]), - ("# Largest contig (Unicycler)", ["Largest contig"]), - ("# N50 (Unicycler)", ["N50"]), + ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), ], ), @@ -180,19 +188,60 @@ def main(args=None): ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), - ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage:"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ] + ), + ] + + nanopore_assembly_files = [ + ( + "multiqc_nanostat.yaml", + [ + ("# Input reads", ["Number of reads_fastq"]), + ("# Median read lenght", ["Median read length_fastq"]), + ("# Median read quality", ["Median read quality_fastq"]), + ] + ), + ( + "multiqc_quast.yaml", # TODO: "multiqc_quast_quast_{assemblertool}.yaml" + [ + ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", 
["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), ] ), ] ## Write de novo assembly metrics to file - metrics_dict_to_file( - file_field_list=illumina_assembly_files, - multiqc_data_dir=args.MULTIQC_DATA_DIR, - out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", - valid_sample_list=[], - ) + if args.ASSEMBLY_TYPE == 'short': + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == 'long': + metrics_dict_to_file( + file_field_list=nanopore_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) if __name__ == "__main__": sys.exit(main()) diff --git a/modules.json b/modules.json index 5f13ad69..d74965e7 100644 --- a/modules.json +++ b/modules.json @@ -91,7 +91,8 @@ "racon": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/racon/racon.diff" }, "samtools/index": { "branch": "master", diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index 58a6de83..59e1d5b3 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -17,8 +17,8 @@ process KMERFINDER { path "versions.yml" , emit: versions script: - def prefix = task.ext.prefix ?: "${meta.id}" - def in_reads = reads.size() == 1 ? 
"${reads}" : "${reads[0]} ${reads[1]}" + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}" """ kmerfinder.py \\ diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 30ab46b7..025fcac3 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -19,7 +19,7 @@ process MULTIQC { path ('pycoqc/*') path ('kraken2_short/*') path ('kraken2_long/*') - path ('quast_unicycler/*') + path ('quast/*') path ('prokka/*') path ('bakta/*') path ('extra/*') @@ -41,7 +41,7 @@ process MULTIQC { cp extra/* multiqc_data/ ## Parse YAML files dumped by MultiQC to obtain metrics - multiqc_to_custom_csv.py + multiqc_to_custom_csv.py --assembly_type $params.assembly_type ## Run multiqc a second time multiqc -f $args -e general_stats $custom_config . diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf index 6d0cceb2..8f1cbfa9 100644 --- a/modules/nf-core/racon/main.nf +++ b/modules/nf-core/racon/main.nf @@ -11,7 +11,7 @@ process RACON { tuple val(meta), path(reads), path(assembly), path(paf) output: - tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly path "versions.yml" , emit: versions when: @@ -26,9 +26,9 @@ process RACON { "${paf}" \\ $args \\ "${assembly}" > \\ - ${prefix}_assembly_consensus.fasta + ${prefix}.consensus.fasta - gzip -n ${prefix}_assembly_consensus.fasta + gzip -n ${prefix}.consensus.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/racon/racon.diff b/modules/nf-core/racon/racon.diff new file mode 100644 index 00000000..c6e8d118 --- /dev/null +++ b/modules/nf-core/racon/racon.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/racon' +--- modules/nf-core/racon/main.nf ++++ modules/nf-core/racon/main.nf +@@ -11,7 +11,7 @@ + tuple val(meta), path(reads), path(assembly), path(paf) + + 
output: +- tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly ++ tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: +@@ -26,9 +26,9 @@ + "${paf}" \\ + $args \\ + "${assembly}" > \\ +- ${prefix}_assembly_consensus.fasta ++ ${prefix}.consensus.fasta + +- gzip -n ${prefix}_assembly_consensus.fasta ++ gzip -n ${prefix}.consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 98059137..f45f1330 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -39,8 +39,11 @@ if(! params.skip_kraken2){ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +if(params.assembly_type){ + ch_multiqc_config = file("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) +} else { + ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +} ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) @@ -392,8 +395,8 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity - // TODO: Create kmerfinder mode for longreads + // MODULE: Kmerfinder, QC for sample purity. 
Available for short, long and hybrid assemblies + // if ( !params.skip_kmerfinder && params.kmerfinderdb ) { if( params.kmerfinderdb.endsWith('.gz') ){ @@ -403,11 +406,17 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } + if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) { + ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads + } else if ( params.assembly_type == 'long' ) { + ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads + } + KMERFINDER_SUBWORKFLOW ( ch_kmerfinderdb, params.reference_ncbi_bacteria, - ch_for_assembly.map{meta, sr, lr -> tuple( meta, sr)}, // [meta, reads] - ch_assembly // [meta, consensus] + ch_for_kmerfinder, + ch_assembly ) ch_refseqid = KMERFINDER_SUBWORKFLOW.out.refseqids ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta @@ -415,7 +424,7 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Processing output: group data according to their ref-genome and rename meta according to the number of identified references + // Group data based on ref-genome and rename meta according to the identified references count. 
ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] @@ -548,7 +557,7 @@ workflow BACASS { ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), - ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // FIXME: input filename collision + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // TODO: Create a quast channel for each assembler ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), From 04ca4b34321712a3608439f5069f649b0202affa Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 3 Jan 2024 13:45:29 +0100 Subject: [PATCH 40/58] fix custom multiqc when kmerfinder is not invoked --- assets/multiqc_config_long.yml | 3 +++ assets/multiqc_config_short.yml | 3 +++ conf/modules.config | 43 +++++++++++++++-------------- modules.json | 5 ---- modules/local/multiqc_custom.nf | 24 +++++++++++------ workflows/bacass.nf | 48 +++++++++++++++++++++------------ 6 files changed, 76 insertions(+), 50 deletions(-) diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml index 083ea39b..7c5349ba 100644 --- a/assets/multiqc_config_long.yml +++ b/assets/multiqc_config_long.yml @@ -17,6 +17,9 @@ run_modules: - prokka - bakta +exclude_modules: + - general_stats + module_order: - nanostat: name: "PREPROCESS: Nanoplot" diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml index ae8eaebe..c068b167 100644 --- a/assets/multiqc_config_short.yml +++ b/assets/multiqc_config_short.yml @@ -16,6 +16,9 @@ run_modules: - prokka - bakta +exclude_modules: + - general_stats + module_order: - fastqc: name: "PREPROCESS: FastQC (raw reads)" diff --git a/conf/modules.config 
b/conf/modules.config index 7a7296c2..dbb8115e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -160,26 +160,6 @@ process { ] } - withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { - ext.args = '' - publishDir = [ - path: { "${params.outdir}/Kmerfinder/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: "*.txt", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { - ext.args = '' - publishDir = [ - path: { "${params.outdir}/Kmerfinder" }, - mode: params.publish_dir_mode, - pattern: "*.csv", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'KRAKEN2_LONG' { ext.args = '' publishDir = [ @@ -307,6 +287,29 @@ if (!params.skip_fastp) { } } } +if (!params.skip_kmerfinder) { + process { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.txt", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } +} if (params.annotation_tool == 'bakta') { if (params.baktadb_download == true) { diff --git a/modules.json b/modules.json index d74965e7..639e1712 100644 --- a/modules.json +++ b/modules.json @@ -61,11 +61,6 @@ "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, - "multiqc": { - "branch": "master", - "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", - "installed_by": ["modules"] - }, "nanoplot": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf index 025fcac3..32a49dd5 100644 --- a/modules/local/multiqc_custom.nf +++ b/modules/local/multiqc_custom.nf @@ -10,7 +10,7 @@ process MULTIQC { path 'multiqc_config.yaml' path multiqc_custom_config path software_versions - //path workflow_summary + path workflow_summary path multiqc_logo path ('fastqc/*') path ('fastp/*') @@ -25,10 +25,11 @@ process MULTIQC { path ('extra/*') output: - path "*multiqc_report.html" , emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_assembly_metrics_mqc.csv" , optional:true, emit: csv_assembly + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' @@ -38,13 +39,20 @@ process MULTIQC { multiqc -f $args $custom_config . ## Collect additional files to be included in the report - cp extra/* multiqc_data/ + if [ -d extra/ ]; then + cp extra/* multiqc_data/ + fi - ## Parse YAML files dumped by MultiQC to obtain metrics + ## Create multiqc custom data multiqc_to_custom_csv.py --assembly_type $params.assembly_type + ## Avoid the custom Multiqc table when the kmerfinder process is not invoked. 
+ if grep ">skip_kmerfinder<" workflow_summary_mqc.yaml; then + rm *_assembly_metrics_mqc.csv + fi + ## Run multiqc a second time - multiqc -f $args -e general_stats $custom_config . + multiqc -f $args $custom_config . cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/bacass.nf b/workflows/bacass.nf index f45f1330..d3e05aa3 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -26,26 +26,34 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check krakendb -if(! params.skip_kraken2){ - if(params.kraken2db){ +if (!params.skip_kraken2) { + if (params.kraken2db) { kraken2db = file(params.kraken2db) } else { exit 1, "Missing Kraken2 DB arg" } } +// Check kmerfinderdb +if (!params.skip_kmerfinder && !params.kmerfinderdb){ + exit 1, "Missing Kmerfinder DB arg: --kmerfinderdb " +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -if(params.assembly_type){ + + +// When invoking kmerfinder, utilize a custom MultiQC config file to generate a specialized report. This report will organize samples into groups based on their reference genome, which was previously calculated by kmerfinder. +if (!params.skip_kmerfinder && params.assembly_type) { ch_multiqc_config = file("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) } else { ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) } ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] -ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* @@ -258,6 +266,7 @@ workflow BACASS { // // MODULE: Miniasm, genome assembly, long reads + // if ( params.assembler == 'miniasm' ) { MINIMAP2_ALIGN ( ch_for_assembly.map{ meta,sr,lr -> tuple(meta,lr) }, @@ -336,7 +345,7 @@ workflow BACASS { ch_for_polish // tuple val(meta), val(reads), file(longreads), file(assembly) .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) - .join( ch_fast5 ) // tuple val(meta), file(fast5) + .join( ch_fast5 ) // tuple val(meta), file(fast5) .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. @@ -395,10 +404,13 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity. Available for short, long and hybrid assemblies + // MODULE: Kmerfinder, QC for sample purity. 
// - if ( !params.skip_kmerfinder && params.kmerfinderdb ) { + ch_kmerfinder_multiqc = Channel.empty() + if (!params.skip_kmerfinder) { + + // Process kmerfinder database if( params.kmerfinderdb.endsWith('.gz') ){ GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) ch_kmerfinderdb = GUNZIP_KMERFINDERDB.out.gunzip @@ -406,6 +418,7 @@ workflow BACASS { ch_kmerfinderdb = params.kmerfinderdb } + // Set kmerfinder input based on assembly type if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) { ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads } else if ( params.assembly_type == 'long' ) { @@ -422,12 +435,13 @@ workflow BACASS { ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq + ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) // Group data based on ref-genome and rename meta according to the identified references count. ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] - .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff] + .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff ] .groupTuple(by:0) .map { refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> ch_refseqid.size() @@ -445,9 +459,9 @@ workflow BACASS { // // FIXME: simplify it. 
I think choolsing anotherapproach will improve it ch_assembly - .collect{ it[1]} - .map{ consensus -> tuple([id:'report'], consensus)} - .set{ch_to_quast} + .collect{it[1]} + .map{ consensus -> tuple([id:'report'], consensus) } + .set{ ch_to_quast } if(params.skip_kmerfinder){ QUAST( @@ -456,7 +470,7 @@ workflow BACASS { params.reference_gff ?: [[:],[]] ) ch_quast_multiqc = QUAST.out.results - } else if (ch_to_quast_byrefseq){ + } else if (!params.skip_kmerfinder && ch_to_quast_byrefseq) { QUAST( ch_to_quast, [[:],[]], @@ -467,9 +481,9 @@ workflow BACASS { ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} ) + ch_quast_multiqc = QUAST_BYREFSEQID.out.results } - ch_quast_multiqc = QUAST_BYREFSEQID.out.results - ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) // Check assemblies that require further processing for gene annotation ch_assembly @@ -548,7 +562,7 @@ workflow BACASS { ch_multiqc_config, ch_multiqc_custom_config, CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), - //ch_workflow_summary, // FIXME: Cannot parse this file... 
+ ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), ch_multiqc_logo.collect().ifEmpty([]), ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), @@ -557,10 +571,10 @@ workflow BACASS { ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), - ch_quast_multiqc.collect{it[1]}.ifEmpty([]), // TODO: Create a quast channel for each assembler + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), - KMERFINDER_SUBWORKFLOW.out.summary_yaml.collectFile(name: 'multiqc_kmerfinder.yaml'), + ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) multiqc_report = MULTIQC.out.report.toList() } From 3c6297af72570aac7ef859f985cdda3d6c235cef Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 4 Jan 2024 11:41:59 +0100 Subject: [PATCH 41/58] add custom multiqc for hybrid assembly --- assets/multiqc_config_hybrid.yml | 166 +++++++++++++++++++++++++++++++ bin/multiqc_to_custom_csv.py | 55 +++++++++- nextflow.config | 2 +- 3 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 assets/multiqc_config_hybrid.yml diff --git a/assets/multiqc_config_hybrid.yml b/assets/multiqc_config_hybrid.yml new file mode 100644 index 00000000..4c036265 --- /dev/null +++ b/assets/multiqc_config_hybrid.yml @@ -0,0 +1,166 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. 
+ +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (shorts & long reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input short reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed short reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Input long reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median long reads lenght": + description: "Median read lenght (bp)" + format: "{:,.0f}" + "# Median long reads quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs (hybrid assembly)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (hybrid assembly)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (hybrid assembly)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction (hybrid assembly)": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Specie name of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit 
from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index bccb0143..1e79910b 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -67,7 +67,7 @@ def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_ "number_of_SNPs", "number_of_indels", "MISSENSE", - "# contigs (>= 0 bp)", + "# contigs", "# contigs (>= 5000 bp)", "Largest contig", ] @@ -173,7 +173,7 @@ def main(args=None): ( "multiqc_quast.yaml", [ - ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Contigs", ["# contigs"]), ("# Largest contig", ["Largest contig"]), ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), @@ -204,9 +204,9 @@ 
def main(args=None): ] ), ( - "multiqc_quast.yaml", # TODO: "multiqc_quast_quast_{assemblertool}.yaml" + "multiqc_quast.yaml", [ - ("# Contigs", ["# contigs (>= 0 bp)"]), + ("# Contigs", ["# contigs"]), ("# Largest contig", ["Largest contig"]), ("# N50", ["N50"]), ("# % Genome fraction", ["Genome fraction (%)"]), @@ -227,6 +227,46 @@ def main(args=None): ), ] + hybrid_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input short reads", ["before_filtering", "total_reads"]), + ("# Trimmed short reads (fastp)", ["after_filtering", "total_reads"]), + ] + ), + ( + "multiqc_nanostat.yaml", + [ + ("# Input long reads", ["Number of reads_fastq"]), + ("# Median long reads lenght", ["Median read length_fastq"]), + ("# Median long reads quality", ["Median read quality_fastq"]), + ] + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs (hybrid assembly)", ["# contigs"]), + ("# Largest contig (hybrid assembly)", ["Largest contig"]), + ("# N50 (hybrid assembly)", ["N50"]), + ("# % Genome fraction (hybrid assembly)", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), + ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), + ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ] + ), + ] + ## Write de novo assembly metrics to file if args.ASSEMBLY_TYPE == 'short': metrics_dict_to_file( @@ -242,6 +282,13 @@ def main(args=None): out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", valid_sample_list=[], ) + elif args.ASSEMBLY_TYPE == 
'hybrid': + metrics_dict_to_file( + file_field_list=hybrid_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) if __name__ == "__main__": sys.exit(main()) diff --git a/nextflow.config b/nextflow.config index 94d1fdbb..fa5b3912 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,7 +46,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false - skip_kmerfinder = false + skip_kmerfinder = true skip_pycoqc = false skip_annotation = false skip_polish = false From 660799dfff47a1082cbfdb0112e425be913a6acf Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 4 Jan 2024 15:24:33 +0100 Subject: [PATCH 42/58] add file-check-exist and rename variables --- bin/multiqc_to_custom_csv.py | 4 ---- modules/local/find_download_reference.nf | 6 ++++-- modules/local/kmerfinder.nf | 6 +++--- nextflow.config | 2 +- nextflow_schema.json | 6 +++--- subworkflows/local/kmerfinder_subworkflow.nf | 4 ++-- workflows/bacass.nf | 19 ++++++++++++------- 7 files changed, 25 insertions(+), 22 deletions(-) diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 1e79910b..391ca41e 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -63,10 +63,6 @@ def find_tag(d, tag): def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[]): integer_fields = [ - "mapped_passed", - "number_of_SNPs", - "number_of_indels", - "MISSENSE", "# contigs", "# contigs (>= 5000 bp)", "Largest contig", diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 36b59f0c..7ca5e8a3 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -9,7 +9,7 @@ process FIND_DOWNLOAD_REFERENCE { input: tuple val(meta), path(reports, stageAs: 'reports/*') - path(ncbi_reference) + path(ncbi_metadata_db) output: tuple val(meta), path( "*.fna.gz") , emit: fna @@ 
-19,13 +19,15 @@ process FIND_DOWNLOAD_REFERENCE { script: """ + ## Find the common reference genome find_common_reference.py \\ -d reports/ \\ -o references_found.tsv + ## Download the winner reference genome from the ncbi database download_reference.py \\ -file references_found.tsv \\ - -reference $ncbi_reference \\ + -reference $ncbi_metadata_db \\ -out_dir . cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index 59e1d5b3..92aff76d 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -9,7 +9,7 @@ process KMERFINDER { input: tuple val(meta), path(reads) - path(kmerfinderDB) + path(kmerfinder_db) output: tuple val(meta), path("*_results.txt") , emit: report @@ -24,8 +24,8 @@ process KMERFINDER { kmerfinder.py \\ --infile $in_reads \\ --output_folder . \\ - --db_path ${kmerfinderDB}/bacteria.ATG \\ - -tax ${kmerfinderDB}/bacteria.name \\ + --db_path ${kmerfinder_db}/bacteria.ATG \\ + -tax ${kmerfinder_db}/bacteria.name \\ -x mv results.txt ${prefix}_results.txt diff --git a/nextflow.config b/nextflow.config index fa5b3912..80bb2de4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,7 @@ params { kmerfinderdb = "" reference_fasta = "" reference_gff = "" - reference_ncbi_bacteria = "" + ncbi_assembly_metadata = "" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] diff --git a/nextflow_schema.json b/nextflow_schema.json index 0f0e9185..43a732e0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -76,7 +76,7 @@ }, "kmerfinderdb": { "type": "string", - "description": "Database for Kmerfinder.", + "description": "Path to the Kmerfinder bacteria database.", "help_text": "" }, "reference_fasta": { @@ -87,9 +87,9 @@ "type": "string", "description": "Reference GFF file." 
}, - "reference_ncbi_bacteria": { + "ncbi_assembly_metadata": { "type": "string", - "description": "NCBI Bacteria reference database" + "description": "Master file (*.txt) containing a summary of assemblies available in GenBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" } } }, diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index ffe3f74e..d7775372 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -9,7 +9,7 @@ include { QUAST } from '../../modules/nf-core/quast/main' workflow KMERFINDER_SUBWORKFLOW { take: kmerfinder_db // channel: [ path ] - ncbi_bacteria_db // channel: [ path ] + ncbi_assembly_metadata // channel: [ path ] reads // channel: [ meta, reads ] consensus // channel: [ meta, consensus ] @@ -49,7 +49,7 @@ workflow KMERFINDER_SUBWORKFLOW { if (!params.reference_fasta && !params.reference_gff) { FIND_DOWNLOAD_REFERENCE ( ch_consensus_byrefseq.map{ refseq, meta, report_txt, fasta -> tuple(refseq, report_txt)}, - ncbi_bacteria_db + ncbi_assembly_metadata ) ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff diff --git a/workflows/bacass.nf b/workflows/bacass.nf index d3e05aa3..ee174fc3 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -28,17 +28,21 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check krakendb if (!params.skip_kraken2) { if (params.kraken2db) { - kraken2db = file(params.kraken2db) + kraken2db = file(params.kraken2db, checkIfExists: true) } else { exit 1, "Missing Kraken2 DB arg" } } -// Check kmerfinderdb -if (!params.skip_kmerfinder && !params.kmerfinderdb){ - exit 1, "Missing Kmerfinder DB arg: --kmerfinderdb " +// Check kmerfinder dependencies +if (!params.skip_kmerfinder) { + if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) { + exit 1, "[KMERFINDER]: Missing 
--kmerfinder_db and/or --ncbi_assembly_metadata arguments. Both are required to run KMERFINDER." + } else { + file(params.kmerfinderdb, checkIfExists: true) + file(params.ncbi_assembly_metadata, checkIfExists: true) + } } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -404,8 +408,9 @@ workflow BACASS { } // - // MODULE: Kmerfinder, QC for sample purity. + // SUBWORKFLOW: Kmerfinder, QC for sample purity. // + // TODO: Executes both kmerfinder and organizes samples by the reference genome (all this through the kmerfinder_subworkflow()). Ideally, users can also utilize kmerfinder independently without the need to download reference genome and grouping data —simply running kmerfinder alone-. ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { @@ -427,7 +432,7 @@ workflow BACASS { KMERFINDER_SUBWORKFLOW ( ch_kmerfinderdb, - params.reference_ncbi_bacteria, + params.ncbi_assembly_metadata, ch_for_kmerfinder, ch_assembly ) From aeaa20ee526d06910989529f6039c90f12c83693 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 5 Jan 2024 12:50:15 +0100 Subject: [PATCH 43/58] update documentation and add save_trimmed option --- README.md | 11 +++++++++-- conf/modules.config | 7 +++++-- docs/output.md | 25 ++++++++++++++++++++++--- nextflow.config | 1 + nextflow_schema.json | 7 +++++-- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4462de74..aa80cd5f 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ### Short Read Assembly -This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). 
Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) to verify sample purity. +This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. ### Long Read Assembly -For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). +For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. + The pipeline can then perform long read assembly utilizing [Unicycler](https://github.com/rrwick/Unicycler), [Miniasm](https://github.com/lh3/miniasm) in combination with [Racon](https://github.com/isovic/racon), [Canu](https://github.com/marbl/canu) or [Flye](https://github.com/fenderglass/Flye) by using the [Dragonflye](https://github.com/rpetit3/dragonflye)(\*) pipeline. 
Long reads assembly can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files. > [!NOTE] @@ -48,6 +49,12 @@ For users specifying both short read and long read (NanoPore) data, the pipeline In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core). +In specific cases where samples recorded in the input samplesheet belong to more than one species, the pipeline finds and downloads their respective reference genomes (this also works with a single-species input samplesheet). It then groups the samples into batches and collects assembly QC results based on their corresponding reference genomes. + +> NOTE: This scenario is only supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed. + + +In cases where the input samplesheet contains samples from more than one species, the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST report containing all the input samples and a per-reference-genome QUAST report, that is, a QUAST report for each reference genome. ## Usage > [!NOTE] diff --git a/conf/modules.config b/conf/modules.config index dbb8115e..9d240c78 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,7 +37,8 @@ process { publishDir = [ path: { "${params.outdir}/trimming/longreads" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enable: params.save_trimmed ] } @@ -181,6 +182,7 @@ process { publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, + pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}", saveAs: { filename -> if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null @@ -252,7 +254,8 @@ if (!params.skip_fastp) { path: { "${params.outdir}/trimming/shortreads" }, mode: params.publish_dir_mode, pattern: "*.fastp.fastq.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_trimmed ], [ path: { "${params.outdir}/trimming/shortreads/json_html" }, diff --git a/docs/output.md b/docs/output.md index ba44aa38..2b043a9c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -119,6 +119,21 @@ Exemplary Kraken2 report screenshot: +## Reads QC and Sample purity + +The pipeline includes a dedicated step for short and long reads QC as well as contamination analysis using [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/). This process helps assess the quality and purity of the samples. + +
+Output files + +- `Kmerfinder/{ID}/` + - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information. + +- `Kmerfinder/` + - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder. + +
+ ## Assembly Output Trimmed reads are assembled with [Unicycler](https://github.com/rrwick/Unicycler) in `short` or `hybrid` assembly modes. For long-read assembly, there are also `canu` and `miniasm` available. @@ -181,9 +196,12 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo
Output files -- `QUAST` - - `report.tsv`: QUAST's report in text format -- `QUAST/report` +- `QUAST/report/` + - `icarus.html`: QUAST's contig browser as HTML + - `report.html`: QUAST assembly QC as HTML report + - `report.pdf`: QUAST assembly QC as pdf + +- `QUAST/runs_per_reference/{reference_assembly}/` - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf @@ -241,6 +259,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - `multiqc_plots/`: directory containing static images from the report in various formats. + - `summary_assembly_metrics_mqc.csv`: custom table containing most relevant assembly QC metrics. 
diff --git a/nextflow.config b/nextflow.config index 80bb2de4..aba44d3d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null // QC and trimming options + save_trimmed = false save_trimmed_fail = false save_merged = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 43a732e0..e39ca896 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,6 +42,10 @@ "description": "Parameters for QC and trim short-reads", "default": "", "properties": { + "save_trimmed": { + "type": "boolean", + "description": "save trimmed files" + }, "save_trimmed_fail": { "type": "boolean", "enum": ["true", "false"], @@ -76,8 +80,7 @@ }, "kmerfinderdb": { "type": "string", - "description": "Path to the Kmerfinder bacteria database.", - "help_text": "" + "description": "Path to the Kmerfinder bacteria database." }, "reference_fasta": { "type": "string", From 95d965de55916e6c8ee6410d74e93296ce4e8821 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 5 Jan 2024 16:05:59 +0100 Subject: [PATCH 44/58] add fastp additional options and fix input sample path --- conf/modules.config | 4 ++-- nextflow.config | 1 + nextflow_schema.json | 4 ++++ workflows/bacass.nf | 4 ++-- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9d240c78..52509b0d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -248,7 +248,7 @@ if (!params.skip_fastqc) { if (!params.skip_fastp) { process { withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTP' { - ext.args = '' + ext.args = params.fastp_args ? params.fastp_args : '' publishDir = [ [ path: { "${params.outdir}/trimming/shortreads" }, @@ -297,7 +297,7 @@ if (!params.skip_kmerfinder) { publishDir = [ path: { "${params.outdir}/Kmerfinder/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.txt", + pattern: "*.{txt,json}", saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } diff --git a/nextflow.config b/nextflow.config index aba44d3d..5ff8c5ce 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null // QC and trimming options + fastp_args = "" save_trimmed = false save_trimmed_fail = false save_merged = false diff --git a/nextflow_schema.json b/nextflow_schema.json index e39ca896..07ac01e7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,6 +42,10 @@ "description": "Parameters for QC and trim short-reads", "default": "", "properties": { + "fastp_args": { + "type": "string", + "description": "This can be used to pass arguments to [Fastp](https://github.com/OpenGene/fastp)" + }, "save_trimmed": { "type": "boolean", "description": "save trimmed files" diff --git a/workflows/bacass.nf b/workflows/bacass.nf index ee174fc3..0604e43a 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -136,8 +136,8 @@ workflow BACASS { def criteria = multiMapCriteria { meta, fastq_1, fastq_2, long_fastq, fast5 -> shortreads: fastq_1 != 'NA' ? tuple(meta, [file(fastq_1), file(fastq_2)]) : null - longreads: long_fastq != 'NA' ? tuple(meta, file(long_fastq)) : null - fast5: fast5 != 'NA' ? tuple(meta, fast5) : null + longreads: long_fastq != 'NA' ? tuple(meta, file(long_fastq)) : null + fast5: fast5 != 'NA' ? 
tuple(meta, fast5) : null } // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ Channel From b33a37d2566f28087e609d3b0131105668c9243c Mon Sep 17 00:00:00 2001 From: Dani VM Date: Mon, 15 Jan 2024 15:42:20 +0100 Subject: [PATCH 45/58] allow module to emit tsv report --- modules/local/find_download_reference.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 7ca5e8a3..95cb8e09 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -12,10 +12,11 @@ process FIND_DOWNLOAD_REFERENCE { path(ncbi_metadata_db) output: - tuple val(meta), path( "*.fna.gz") , emit: fna - tuple val(meta), path( "*.gff.gz") , emit: gff - tuple val(meta), path( "*.faa.gz") , emit: faa - path "versions.yml" , emit: versions + tuple val(meta), path("*.fna.gz") , emit: fna + tuple val(meta), path("*.gff.gz") , emit: gff + tuple val(meta), path("*.faa.gz") , emit: faa + tuple val(meta), path("references_found.tsv") , emit: references_tsv + path "versions.yml" , emit: versions script: """ From 07034179ee314e8af9bad28a1df533d55f94ab5b Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 17 Jan 2024 17:15:08 +0100 Subject: [PATCH 46/58] fix kmerFinder by narrowing down the reference genomes to a single winning candidate --- bin/download_reference.py | 4 +- conf/modules.config | 6 +- modules/local/find_download_reference.nf | 13 ++-- subworkflows/local/kmerfinder_subworkflow.nf | 63 ++++++++++++-------- workflows/bacass.nf | 30 ++++------ 5 files changed, 61 insertions(+), 55 deletions(-) diff --git a/bin/download_reference.py b/bin/download_reference.py index 8fa18da4..17a43351 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -98,7 +98,9 @@ def download_references(file, reference, out_dir): ] top_reference = infile[0][0] - print(top_reference) + with 
open(str(top_reference) + ".winner", 'w') as topref: + topref.write(top_reference) + # create the outdir (do nothing if already there) try: diff --git a/conf/modules.config b/conf/modules.config index 52509b0d..ff8e6cc0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,6 +31,7 @@ process { ] } + // FIXME: SAVE TRIMMED NOT WORKING withName: 'PORECHOP_PORECHOP' { ext.args = '' ext.prefix = { "${meta.id}.porechop" } @@ -177,12 +178,13 @@ process { ] } - withName: 'QUAST' { + // FIXME: output structure and meta updated. It might require a fixme + withName: 'QUAST|QUAST_BYREFSEQID' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}", + //pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}", saveAs: { filename -> if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf index 95cb8e09..8f7a52e4 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -8,15 +8,16 @@ process FIND_DOWNLOAD_REFERENCE { 'biocontainers/requests:2.26.0' }" input: - tuple val(meta), path(reports, stageAs: 'reports/*') + tuple val(refmeta), path(reports, stageAs: 'reports/*') path(ncbi_metadata_db) output: - tuple val(meta), path("*.fna.gz") , emit: fna - tuple val(meta), path("*.gff.gz") , emit: gff - tuple val(meta), path("*.faa.gz") , emit: faa - tuple val(meta), path("references_found.tsv") , emit: references_tsv - path "versions.yml" , emit: versions + tuple val(refmeta), path("*.fna.gz") , emit: fna + tuple val(refmeta), path("*.gff.gz") , emit: gff + tuple val(refmeta), path("*.faa.gz") , emit: faa + tuple val(refmeta), path("references_found.tsv") , emit: references_tsv + tuple val(refmeta), path("*.winner") , emit: winner + path "versions.yml" , emit: versions script: 
""" diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index d7775372..d2e1cd36 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -8,15 +8,15 @@ include { QUAST } from '../../modules/nf-core/quast/main' workflow KMERFINDER_SUBWORKFLOW { take: - kmerfinder_db // channel: [ path ] - ncbi_assembly_metadata // channel: [ path ] - reads // channel: [ meta, reads ] - consensus // channel: [ meta, consensus ] + kmerfinder_db // channel: [ path ] + ncbi_assembly_metadata // channel: [ path ] + reads // channel: [ meta, reads ] + consensus // channel: [ meta, consensus ] main: ch_versions = Channel.empty() - // MODULE: Kmerfinder, QC for sample purity + // MODULE: Kmerfinder, QC for sample purity. Identifies reference specie and reference genome assembly for each sample. KMERFINDER ( reads, kmerfinder_db @@ -25,48 +25,59 @@ workflow KMERFINDER_SUBWORKFLOW { ch_kmerfinder_json = KMERFINDER.out.json ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) - // MODULE: Kmerfinder summary report + // MODULE: Kmerfinder summary report. Generates a csv summary file collecting all sample reports. KMERFINDER_SUMMARY ( ch_kmerfinder_report.map{meta, report -> report }.collect() ) ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - // SUBWORKFLOW: Group assemblies by reference geneome + // SUBWORKFLOW: Grouping reports by identified reference species. 
ch_kmerfinder_json .join(ch_kmerfinder_report, by:0) .join(consensus, by:0) .map{ meta, report_json, report_txt, fasta -> - def refseq = [:] - refseq.id = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Assembly"] - return tuple(refseq, meta, report_txt, fasta) + specie = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Species"] + return tuple(specie, meta, report_txt, fasta) } - .groupTuple(by:0) - .set { ch_consensus_byrefseq } + .groupTuple(by:0) // Group by the "Species" field + .set { ch_reports_byreference } - // MODULE: Find & Download common reference sequences + // SUBWORKFLOW: For each specie target, this subworkflow collects the reference genomes assemblies ('GCF*'), and subsequently, downloads the winning reference assembly. if (!params.reference_fasta && !params.reference_gff) { FIND_DOWNLOAD_REFERENCE ( - ch_consensus_byrefseq.map{ refseq, meta, report_txt, fasta -> tuple(refseq, report_txt)}, + ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, ncbi_assembly_metadata ) - ch_reference_fasta = FIND_DOWNLOAD_REFERENCE.out.fna - ch_reference_gff = FIND_DOWNLOAD_REFERENCE.out.gff ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) - } - // Get reference sequence IDs - ch_consensus_byrefseq - .map{ refseq, meta, report_txt, fasta -> refseq } - .collect() - .set { ch_refseqid } + // Arrange sample's assemblyes into channels with their corresponding reference files. 
+ ch_reports_byreference + .join(FIND_DOWNLOAD_REFERENCE.out.fna) + .join(FIND_DOWNLOAD_REFERENCE.out.gff) + .join(FIND_DOWNLOAD_REFERENCE.out.winner) + .map { + specie, meta, report_txt, fasta, fna, gff, winner_id -> + return tuple([id: winner_id.getBaseName()], meta, fasta, fna, gff) + } + .set { ch_consensus_byrefseq } + + } else if (params.reference_fasta && params.reference_gff) { + // TODO: Haven't tested so far + //ch_reports_byreference + // .join(params.reference_fasta) + // .join(params.reference_gff) + // .map { + // refmeta, meta, report_txt, fasta -> + // refmeta.id = params.reference_fasta.getBaseName() + // return tuple(refmeta, meta, fasta, fna, gff) + // } + // .set { ch_consensus_byrefseq } + } emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yml) ] - refseqids = ch_refseqid // channel: [ val(refseq1), val(refseq1),...] - reference_fasta = ch_reference_fasta // channel: [ meta, path(*.fna) ] - reference_gff = ch_reference_gff // channel: [ meta, path(*.gff) ] - consensus_byrefseq = ch_consensus_byrefseq // channel: [ refseq, meta, report_txt, fasta ] + consensus_byrefseq = ch_consensus_byrefseq // channel: [ refmeta, meta, fasta, fna, gff ] } diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 0604e43a..b73817d7 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -430,31 +430,22 @@ workflow BACASS { ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads } + // TODO: Future versions are intended to operate seamlessly without the need for *.fasta assembly files to be included in the subworkflow (by using sample name [meta] only). This enhancement aims to optimize efficiency. 
KMERFINDER_SUBWORKFLOW ( ch_kmerfinderdb, params.ncbi_assembly_metadata, ch_for_kmerfinder, ch_assembly ) - ch_refseqid = KMERFINDER_SUBWORKFLOW.out.refseqids - ch_reference_fasta = KMERFINDER_SUBWORKFLOW.out.reference_fasta - ch_reference_gff = KMERFINDER_SUBWORKFLOW.out.reference_gff - ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml + ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Group data based on ref-genome and rename meta according to the identified references count. - ch_consensus_byrefseq // [ refseq, meta, report_txt, consensus ] - .join(ch_reference_fasta) // [ refseq, meta, report_txt, consensus, ref_fasta ] - .join(ch_reference_gff) // [ refseq, meta, report_txt, consensus, ref_fasta, ref_gff ] - .groupTuple(by:0) + // Parsing channel to set up QUAST_BYREFSEQ input. + ch_consensus_byrefseq // [ refseq, meta, fasta, ref_fna, ref_gff ] .map { - refseq, meta, report_txt, consensus, ref_fasta, ref_gff -> ch_refseqid.size() - if (ch_refseqid.size().getVal() > 1 ){ - return [refseq, consensus.flatten(), ref_fasta, ref_gff] - } else { - return [[id:'report'], consensus.flatten(), ref_fasta, ref_gff] - } + refmeta, meta, consensus, ref_fna, ref_gff -> + return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff) } .set { ch_to_quast_byrefseq } } @@ -462,7 +453,6 @@ workflow BACASS { // // MODULE: QUAST, assembly QC // - // FIXME: simplify it. 
I think choolsing anotherapproach will improve it ch_assembly .collect{it[1]} .map{ consensus -> tuple([id:'report'], consensus) } @@ -475,16 +465,16 @@ workflow BACASS { params.reference_gff ?: [[:],[]] ) ch_quast_multiqc = QUAST.out.results - } else if (!params.skip_kmerfinder && ch_to_quast_byrefseq) { + } else if (!params.skip_kmerfinder) { QUAST( ch_to_quast, [[:],[]], [[:],[]] ) QUAST_BYREFSEQID( - ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, consensus)}, - ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_fasta)}, - ch_to_quast_byrefseq.map{ refseqid, consensus, ref_fasta, ref_gff -> tuple( refseqid, ref_gff)} + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, consensus)}, + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_fasta)}, + ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_gff)} ) ch_quast_multiqc = QUAST_BYREFSEQID.out.results } From 923696d6ff061e9f00e556d0cb34b7eedc363e51 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Fri, 17 May 2024 16:42:04 +0200 Subject: [PATCH 47/58] fix uncompress method to parse kmerfinder db --- modules.json | 89 +++++++++++++++----- nextflow.config | 4 +- subworkflows/local/kmerfinder_subworkflow.nf | 1 - workflows/bacass.nf | 26 +++--- 4 files changed, 85 insertions(+), 35 deletions(-) diff --git a/modules.json b/modules.json index 639e1712..5950e168 100644 --- a/modules.json +++ b/modules.json @@ -8,101 +8,146 @@ "bakta/bakta": { "branch": "master", "git_sha": "f05fa7c6753f92be861d606378860dcd5c828880", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bakta/baktadbdownload": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "canu": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - 
"installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "dragonflye": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/dragonflye/dragonflye.diff" }, "fastp": { "branch": "master", "git_sha": "003920c7f9a8ae19b69a97171922880220bedf56", - "installed_by": ["fastq_trim_fastp_fastqc"] + "installed_by": [ + "fastq_trim_fastp_fastqc" + ] }, "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", - "installed_by": ["fastq_trim_fastp_fastqc"] + "installed_by": [ + "fastq_trim_fastp_fastqc" + ] }, "gunzip": { "branch": "master", "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kraken2/kraken2": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "miniasm": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "multiqc": { + "branch": "master", + "git_sha": "9e71d8519dfbfc328c078bba14d4bd4c99e39a94", + "installed_by": [ + "modules" + ] }, "nanoplot": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/nanoplot/nanoplot.diff" }, "porechop/porechop": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "prokka": 
{ "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "quast": { "branch": "master", "git_sha": "344638191a5d6b3526556410819dfcf24e98039e", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/quast/quast.diff" }, "racon": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/racon/racon.diff" }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "untar": { "branch": "master", "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -111,10 +156,12 @@ "fastq_trim_fastp_fastqc": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 5ff8c5ce..e9289749 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,7 +23,7 @@ params { kmerfinderdb = "" reference_fasta = "" reference_gff = "" - ncbi_assembly_metadata = "" + ncbi_assembly_metadata = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] @@ -48,7 +48,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false - skip_kmerfinder = true + skip_kmerfinder = false skip_pycoqc = false skip_annotation = false skip_polish = false diff --git a/subworkflows/local/kmerfinder_subworkflow.nf 
b/subworkflows/local/kmerfinder_subworkflow.nf index d2e1cd36..2f1ac1e9 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -15,7 +15,6 @@ workflow KMERFINDER_SUBWORKFLOW { main: ch_versions = Channel.empty() - // MODULE: Kmerfinder, QC for sample purity. Identifies reference specie and reference genome assembly for each sample. KMERFINDER ( reads, diff --git a/workflows/bacass.nf b/workflows/bacass.nf index a85295e6..6963d8a6 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -106,7 +106,7 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/krake include { QUAST } from '../modules/nf-core/quast/main' include { QUAST as QUAST_BYREFSEQID } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_KMERFINDERDB } from '../modules/nf-core/gunzip/main' +include { UNTAR } from '../modules/nf-core/untar/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -410,15 +410,18 @@ workflow BACASS { // // SUBWORKFLOW: Kmerfinder, QC for sample purity. // - // TODO: Executes both kmerfinder and organizes samples by the reference genome (all this through the kmerfinder_subworkflow()). Ideally, users can also utilize kmerfinder independently without the need to download reference genome and grouping data —simply running kmerfinder alone-. + // TODO: Executes both kmerfinder and classifies samples by their reference genome (all this through the kmerfinder_subworkflow()). + // Ideally, users can also utilize kmerfinder independently without the need to download reference genome and grouping data —simply running kmerfinder alone-. 
ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { - + // TODO: mv this to subworkflow // Process kmerfinder database - if( params.kmerfinderdb.endsWith('.gz') ){ - GUNZIP_KMERFINDERDB ( params.kmerfinderdb ) - ch_kmerfinderdb = GUNZIP_KMERFINDERDB.out.gunzip + ch_kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) + if ( params.kmerfinderdb.endsWith('.gz') ) { + UNTAR ( [[id: ch_kmerfinderdb.getSimpleName()], ch_kmerfinderdb] ) + ch_kmerfinderdb = UNTAR_KMERFINDERDB.out.untar.map{ meta, file -> file } + } else { ch_kmerfinderdb = params.kmerfinderdb } @@ -445,7 +448,7 @@ workflow BACASS { ch_consensus_byrefseq // [ refseq, meta, fasta, ref_fna, ref_gff ] .map { refmeta, meta, consensus, ref_fna, ref_gff -> - return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff) + return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff) } .set { ch_to_quast_byrefseq } } @@ -453,11 +456,10 @@ workflow BACASS { // // MODULE: QUAST, assembly QC // - // FIXME: simplify it. I think choolsing anotherapproach will improve it ch_assembly - .collect{it[1]} - .map{ consensus -> tuple([id:'report'], consensus) } - .set{ ch_to_quast } + .collect{it[1]} + .map{ consensus -> tuple([id:'report'], consensus) } + .set{ ch_to_quast } if(params.skip_kmerfinder){ QUAST( @@ -467,6 +469,8 @@ workflow BACASS { ) ch_quast_multiqc = QUAST.out.results } else if (!params.skip_kmerfinder) { + // Quast runs twice if kmerfinder is allowed. + // This approach allow Quast to calculate relevant parameters such as genome fraction based on a reference genome. 
QUAST( ch_to_quast, [[:],[]], From b626c5d6d4c7ad2fbb40a9e17b9c67ff12651b08 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Fri, 17 May 2024 23:44:55 +0200 Subject: [PATCH 48/58] add new kmerfinderdb untar method and fix standalone py --- bin/download_reference.py | 29 ++++++++++++++++-------- modules/local/find_download_reference.nf | 2 +- nextflow.config | 2 +- workflows/bacass.nf | 2 +- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/bin/download_reference.py b/bin/download_reference.py index 17a43351..beb971fc 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -52,7 +52,7 @@ #import wget import requests - +# TODO: Generate report def parse_args(args=None): Description = ( "download the reference files \ @@ -117,24 +117,33 @@ def download_references(file, reference, out_dir): if not item.startswith("#") ] - url = [row[19] for row in inref if row[0] in top_reference] + # Initialize an empty list to store the URLs + dir_url = [] + + # Iterate over each row in the inref + for row in inref: + # Construct the ref_query using assembly_accession and asm_name + assembly_accession = row[0] + asm_name = row[15] + ref_query = f"{assembly_accession}_{asm_name}" + + # Check if ref_query matches the search value + if ref_query == top_reference: + # make url # Append the 20th element of the row to the URL list: + assembly_url = row[19] + "/" + ref_query + dir_url.append(assembly_url) - if len(url) == 0: + if len(dir_url) == 0: print("No assemblies responding to the top reference: ", top_reference, " were found") sys.exit(1) - - url = str(url[0]) - url_https = url.replace('ftp', 'https', 1) + dir_url = str(dir_url[0]) # get url and reference file for r_end in reference_ends: - out_file = out_dir + "/" + top_reference + r_end - file_url = url_https + "/" + top_reference + r_end - - print(out_file) + file_url = dir_url + r_end print(file_url) #wget.download(file_url, out_file) diff --git a/modules/local/find_download_reference.nf 
b/modules/local/find_download_reference.nf index 8f7a52e4..87f73664 100644 --- a/modules/local/find_download_reference.nf +++ b/modules/local/find_download_reference.nf @@ -1,6 +1,6 @@ process FIND_DOWNLOAD_REFERENCE { tag "${task.process}" - label 'process_low' + label 'process_medium' conda "conda-forge::requests=2.26.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/nextflow.config b/nextflow.config index e9289749..cd849838 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,7 +23,7 @@ params { kmerfinderdb = "" reference_fasta = "" reference_gff = "" - ncbi_assembly_metadata = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt" + ncbi_assembly_metadata = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt" // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 6963d8a6..62c3a8fd 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -420,7 +420,7 @@ workflow BACASS { ch_kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) if ( params.kmerfinderdb.endsWith('.gz') ) { UNTAR ( [[id: ch_kmerfinderdb.getSimpleName()], ch_kmerfinderdb] ) - ch_kmerfinderdb = UNTAR_KMERFINDERDB.out.untar.map{ meta, file -> file } + ch_kmerfinderdb = UNTAR.out.untar.map{ meta, file -> file } } else { ch_kmerfinderdb = params.kmerfinderdb From 75faba60f1931f8f636725b761700bc26b41c572 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 09:51:39 +0200 Subject: [PATCH 49/58] fix step to prepare kmerfinderdb --- workflows/bacass.nf | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 62c3a8fd..01ac79de 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -39,8 +39,8 @@ if (!params.skip_kmerfinder) { if 
(!params.kmerfinderdb || !params.ncbi_assembly_metadata) { exit 1, "[KMERFINDER]: Missing --kmerfinder_db and/or --ncbi_assembly_metadata arguments. Both are required to run KMERFINDER." } else { - file(params.kmerfinderdb, checkIfExists: true) - file(params.ncbi_assembly_metadata, checkIfExists: true) + kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) + ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true) } } /* @@ -415,15 +415,12 @@ workflow BACASS { ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { - // TODO: mv this to subworkflow - // Process kmerfinder database - ch_kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) - if ( params.kmerfinderdb.endsWith('.gz') ) { - UNTAR ( [[id: ch_kmerfinderdb.getSimpleName()], ch_kmerfinderdb] ) - ch_kmerfinderdb = UNTAR.out.untar.map{ meta, file -> file } - + // Prepare kmerfinder database + if ( kmerfinderdb.name.endsWith('.gz') ) { + UNTAR ( [[ id: kmerfinderdb.getSimpleName() ], kmerfinderdb] ) + ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file } } else { - ch_kmerfinderdb = params.kmerfinderdb + ch_kmerfinderdb_untar = Channel.from(kmerfinder_db) } // Set kmerfinder input based on assembly type @@ -433,10 +430,10 @@ workflow BACASS { ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads } - // TODO: Future versions are intended to operate seamlessly without the need for *.fasta assembly files to be included in the subworkflow (by using sample name [meta] only). This enhancement aims to optimize efficiency. + // RUN kmerfinder subworkflow KMERFINDER_SUBWORKFLOW ( - ch_kmerfinderdb, - params.ncbi_assembly_metadata, + ch_kmerfinderdb_untar, + ncbi_assembly_metadata, ch_for_kmerfinder, ch_assembly ) @@ -444,8 +441,8 @@ workflow BACASS { ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) - // Parsing channel to set up QUAST_BYREFSEQ input. 
- ch_consensus_byrefseq // [ refseq, meta, fasta, ref_fna, ref_gff ] + // Set channel to perform by refseq QUAST based on reference genome identified with KMERFINDER. + ch_consensus_byrefseq .map { refmeta, meta, consensus, ref_fna, ref_gff -> return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff) From fd96b5aef896fa7528acc76dc650832cca3cff4e Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 09:53:00 +0200 Subject: [PATCH 50/58] add kmerfinder to pipeline tests --- conf/test.config | 1 + conf/test_dfast.config | 1 + conf/test_full.config | 3 +++ conf/test_hybrid.config | 1 + conf/test_hybrid_dragonflye.config | 2 ++ conf/test_long.config | 1 + conf/test_long_dragonflye.config | 1 + conf/test_long_miniasm.config | 1 + nextflow.config | 10 +++++----- 9 files changed, 16 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index c827fd2d..7569a722 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_dfast.config b/conf/test_dfast.config index b1b02c4b..7178dd9f 100644 --- a/conf/test_dfast.config +++ b/conf/test_dfast.config @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_full.config b/conf/test_full.config index 9432d763..48ed5ac0 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,4 +17,7 @@ params { // Input data for full size test input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv' kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' + kmerfinderdb = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz?download=1' + ncbi_assembly_metadata = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt' + } diff --git a/conf/test_hybrid.config 
b/conf/test_hybrid.config index c27563a8..b732a883 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -26,4 +26,5 @@ params { assembly_type='hybrid' prokka_args=" --fast" skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_hybrid_dragonflye.config b/conf/test_hybrid_dragonflye.config index 669514d3..99607520 100644 --- a/conf/test_hybrid_dragonflye.config +++ b/conf/test_hybrid_dragonflye.config @@ -27,4 +27,6 @@ params { assembler='dragonflye' prokka_args=" --fast" skip_kraken2 = true + skip_kmerfinder = true + skip_kmerfinder = true } diff --git a/conf/test_long.config b/conf/test_long.config index e722aae8..ec2e287a 100644 --- a/conf/test_long.config +++ b/conf/test_long.config @@ -27,4 +27,5 @@ params { assembly_type = 'long' skip_polish = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_long_dragonflye.config b/conf/test_long_dragonflye.config index 304fb4d8..6222f989 100644 --- a/conf/test_long_dragonflye.config +++ b/conf/test_long_dragonflye.config @@ -23,4 +23,5 @@ params { assembler = 'dragonflye' skip_kraken2 = true skip_polish = true + skip_kmerfinder = true } diff --git a/conf/test_long_miniasm.config b/conf/test_long_miniasm.config index 07af1a2c..052f86f1 100644 --- a/conf/test_long_miniasm.config +++ b/conf/test_long_miniasm.config @@ -27,4 +27,5 @@ params { assembly_type = 'long' assembler = 'miniasm' kraken2db = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz" + skip_kmerfinder = true } diff --git a/nextflow.config b/nextflow.config index cd849838..fff5813e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,11 +19,11 @@ params { save_merged = false // Contamination_screening - kraken2db = "" - kmerfinderdb = "" - reference_fasta = "" - reference_gff = "" - ncbi_assembly_metadata = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt" + kraken2db = '' + kmerfinderdb = '' + reference_fasta = '' + reference_gff = '' + 
ncbi_assembly_metadata = '' // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] From 0fca1d145e696f8f1df1a5c628cbdca681609114 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 10:28:28 +0200 Subject: [PATCH 51/58] kmerfinder subworkflow cleaning --- bin/download_reference.py | 7 ++- subworkflows/local/kmerfinder_subworkflow.nf | 53 ++++++++------------ workflows/bacass.nf | 3 +- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/bin/download_reference.py b/bin/download_reference.py index beb971fc..b0630328 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -5,12 +5,12 @@ ============================================================= INSTITUTION: BU-ISCIII AUTHOR: Guillermo J. Gorines Cordero -MAIL: guillermo.gorines@urjc.es +EDITED BY: Daniel VM VERSION: 0.1 CREATED: Early 2022 REVISED: 18-2-2022 EDITED: 14-11-2023 -DESCRIPTION: +DESCRIPTION: 20-05-2024 Given a file with the kmerfinder results and frequencies (probably created by find_common_reference.py), and the NCBI assembly sheet, download the top-reference genome, gff and protein files from @@ -71,7 +71,7 @@ def parse_args(args=None): ) parser.add_argument( "-reference", - help="File containing the paths to bacterial references." + help="File containing the paths to bacterial references. 
See example in: https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt" ) parser.add_argument( "-out_dir", @@ -140,7 +140,6 @@ def download_references(file, reference, out_dir): dir_url = str(dir_url[0]) # get url and reference file - for r_end in reference_ends: out_file = out_dir + "/" + top_reference + r_end file_url = dir_url + r_end diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index 2f1ac1e9..bab7ae94 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -15,6 +15,7 @@ workflow KMERFINDER_SUBWORKFLOW { main: ch_versions = Channel.empty() + // MODULE: Kmerfinder, QC for sample purity. Identifies reference specie and reference genome assembly for each sample. KMERFINDER ( reads, @@ -24,14 +25,14 @@ workflow KMERFINDER_SUBWORKFLOW { ch_kmerfinder_json = KMERFINDER.out.json ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) - // MODULE: Kmerfinder summary report. Generates a csv summary file collecting all sample reports. + // MODULE: Kmerfinder summary report. Generates a csv report file collecting all sample references. KMERFINDER_SUMMARY ( - ch_kmerfinder_report.map{meta, report -> report }.collect() + ch_kmerfinder_report.map{ meta, report -> report }.collect() ) ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) - // SUBWORKFLOW: Grouping reports by identified reference species. + // SUBWORKFLOW: Create a channel to organize assemblies and reports based on the identified Kmerfinder reference. 
ch_kmerfinder_json .join(ch_kmerfinder_report, by:0) .join(consensus, by:0) @@ -43,37 +44,23 @@ workflow KMERFINDER_SUBWORKFLOW { .groupTuple(by:0) // Group by the "Species" field .set { ch_reports_byreference } - // SUBWORKFLOW: For each specie target, this subworkflow collects the reference genomes assemblies ('GCF*'), and subsequently, downloads the winning reference assembly. - if (!params.reference_fasta && !params.reference_gff) { - FIND_DOWNLOAD_REFERENCE ( - ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, - ncbi_assembly_metadata - ) - ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) - - // Arrange sample's assemblyes into channels with their corresponding reference files. - ch_reports_byreference - .join(FIND_DOWNLOAD_REFERENCE.out.fna) - .join(FIND_DOWNLOAD_REFERENCE.out.gff) - .join(FIND_DOWNLOAD_REFERENCE.out.winner) - .map { - specie, meta, report_txt, fasta, fna, gff, winner_id -> - return tuple([id: winner_id.getBaseName()], meta, fasta, fna, gff) - } - .set { ch_consensus_byrefseq } + // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly. + FIND_DOWNLOAD_REFERENCE ( + ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, + ncbi_assembly_metadata + ) + ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) - } else if (params.reference_fasta && params.reference_gff) { - // TODO: Haven't tested so far - //ch_reports_byreference - // .join(params.reference_fasta) - // .join(params.reference_gff) - // .map { - // refmeta, meta, report_txt, fasta -> - // refmeta.id = params.reference_fasta.getBaseName() - // return tuple(refmeta, meta, fasta, fna, gff) - // } - // .set { ch_consensus_byrefseq } - } + // Organize sample assemblies into channels based on their corresponding reference files. 
+ ch_reports_byreference + .join(FIND_DOWNLOAD_REFERENCE.out.fna) + .join(FIND_DOWNLOAD_REFERENCE.out.gff) + .join(FIND_DOWNLOAD_REFERENCE.out.winner) + .map { + specie, meta, report_txt, fasta, fna, gff, winner_id -> + return tuple([id: winner_id.getBaseName()], meta, fasta, fna, gff) + } + .set { ch_consensus_byrefseq } emit: versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 01ac79de..306f8096 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -410,8 +410,7 @@ workflow BACASS { // // SUBWORKFLOW: Kmerfinder, QC for sample purity. // - // TODO: Executes both kmerfinder and classifies samples by their reference genome (all this through the kmerfinder_subworkflow()). - // Ideally, users can also utilize kmerfinder independently without the need to download reference genome and grouping data —simply running kmerfinder alone-. + // Executes both kmerfinder and classifies samples by their reference genome (all this through the kmerfinder_subworkflow()). 
ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { From 7778a12b2575699c0d8d7f4b3c59815aa94a2154 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 11:10:34 +0200 Subject: [PATCH 52/58] remove unnecessary dependencies after merging branch --- modules.json | 9 +- modules/nf-core/multiqc/environment.yml | 7 -- modules/nf-core/multiqc/main.nf | 55 ------------ modules/nf-core/multiqc/meta.yml | 58 ------------- modules/nf-core/multiqc/tests/main.nf.test | 84 ------------------- .../nf-core/multiqc/tests/main.nf.test.snap | 41 --------- modules/nf-core/multiqc/tests/tags.yml | 2 - workflows/bacass.nf | 4 +- 8 files changed, 3 insertions(+), 257 deletions(-) delete mode 100644 modules/nf-core/multiqc/environment.yml delete mode 100644 modules/nf-core/multiqc/main.nf delete mode 100644 modules/nf-core/multiqc/meta.yml delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test.snap delete mode 100644 modules/nf-core/multiqc/tests/tags.yml diff --git a/modules.json b/modules.json index f69cfefe..6dc814ab 100644 --- a/modules.json +++ b/modules.json @@ -18,9 +18,7 @@ "canu": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "dragonflye": { "branch": "master", @@ -58,11 +56,6 @@ "git_sha": "2c2d1cf80866dbd6dd0ea5d61ddd59533a72d41e", "installed_by": ["modules"] }, - "multiqc": { - "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] - }, "nanoplot": { "branch": "master", "git_sha": "a31407dfaf0cb0d04768d5cb439fc6f4523a6981", diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml deleted file mode 100644 index ca39fb67..00000000 --- a/modules/nf-core/multiqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: multiqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - 
bioconda::multiqc=1.21 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 47ac352f..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - $logo \\ - . 
- - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - mkdir multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index 45a9bc35..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
- pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index f1c4242e..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( 
- { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index bfebd802..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,41 +0,0 @@ -{ - "multiqc_versions_single": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:48:55.657331" - }, - "multiqc_stub": { - "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:49:49.071937" - }, - "multiqc_versions_config": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:49:25.457567" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- 
a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/workflows/bacass.nf b/workflows/bacass.nf index c8670657..7ad3a74f 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -575,10 +575,10 @@ workflow BACASS { ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) - multiqc_report = MULTIQC.out.report.toList() + multiqc_report = MULTIQC_CUSTOM.out.report.toList() emit: - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + multiqc_report = MULTIQC_CUSTOM.out.report.toList() // channel: /path/to/multiqc_report.html versions = ch_versions // channel: [ path(versions.yml) ] } From 820ca64e86ef0e373271fafaa3aa4278f99497ca Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 11:12:13 +0200 Subject: [PATCH 53/58] fix linting after merging branch --- README.md | 2 +- assets/multiqc_config_long.yml | 2 - assets/multiqc_config_short.yml | 1 - bin/csv_to_yaml.py | 29 +++++----- bin/download_reference.py | 30 +++++----- bin/kmerfinder_summary.py | 12 ++-- bin/multiqc_to_custom_csv.py | 97 +++++++++++++++++++++++---------- docs/output.md | 2 + 8 files changed, 105 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index ae07d9b9..2996e355 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,8 @@ In specific cases where samples recorded in the input samplesheet belong to more > NOTE: This scenario is supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed only. - In cases where input samplesheet has files where , the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST containing all the input samples and a by reference genome QUAST report, that is, a quast report for each reference genome. 
+ ## Usage > [!NOTE] diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml index 7c5349ba..51795ec6 100644 --- a/assets/multiqc_config_long.yml +++ b/assets/multiqc_config_long.yml @@ -57,7 +57,6 @@ module_order: path_filters: - "./bakta/*.txt" - report_section_order: nanostat: after: general_stats @@ -134,7 +133,6 @@ custom_data: format: "{:,.0f}" export_plots: true - # # Customise the module search patterns to speed up execution time # # - Skip module sub-tools that we are not interested in # # - Replace file-content searching with filename pattern searching diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml index c068b167..2ce2eca6 100644 --- a/assets/multiqc_config_short.yml +++ b/assets/multiqc_config_short.yml @@ -51,7 +51,6 @@ module_order: path_filters: - "./bakta/*.txt" - report_section_order: fastqc: after: general_stats diff --git a/bin/csv_to_yaml.py b/bin/csv_to_yaml.py index 6f3fc9cf..2a14249b 100755 --- a/bin/csv_to_yaml.py +++ b/bin/csv_to_yaml.py @@ -4,19 +4,14 @@ import csv import yaml + def parse_args(args=None): - Description = ( - "Create a yaml file from csv input file grouping samples as keys and resting fields as their value pair." - ) + Description = "Create a yaml file from csv input file grouping samples as keys and resting fields as their value pair." Epilog = "Example usage: python csv_to_yaml.py -i myfile.csv -k 'sample_name' -o converted_file" parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument( - "-i", - "--input", - type=str, - dest="CSV_FILE", - help="Input file in CSV format." + "-i", "--input", type=str, dest="CSV_FILE", help="Input file in CSV format." ) parser.add_argument( @@ -24,7 +19,7 @@ def parse_args(args=None): "--key_field", type=str, dest="KEY_FIELD", - help="Name of the key/column grupping field in the input csv." 
+ help="Name of the key/column grupping field in the input csv.", ) parser.add_argument( @@ -33,26 +28,32 @@ def parse_args(args=None): type=str, default="output_file", dest="OUT_PREFIX", - help="Output file name" + help="Output file name", ) return parser.parse_args(args) + def parse_csv(csv_file): - with open(csv_file, 'r') as c: + with open(csv_file, "r") as c: csv_reader = csv.DictReader(c) - data = [ row for row in csv_reader] + data = [row for row in csv_reader] return data + def create_yaml(data, key, output_prefix): - yaml_data = {entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data} - with open( output_prefix + '.yaml' , 'w') as yaml_file: + yaml_data = { + entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data + } + with open(output_prefix + ".yaml", "w") as yaml_file: yaml.dump(yaml_data, yaml_file, default_flow_style=False) + def main(args=None): args = parse_args(args) file_list = parse_csv(args.CSV_FILE) create_yaml(data=file_list, key=args.KEY_FIELD, output_prefix=args.OUT_PREFIX) + if __name__ == "__main__": sys.exit(main()) diff --git a/bin/download_reference.py b/bin/download_reference.py index b0630328..88e89364 100755 --- a/bin/download_reference.py +++ b/bin/download_reference.py @@ -49,15 +49,14 @@ import argparse import os -#import wget +# import wget import requests + # TODO: Generate report def parse_args(args=None): - Description = ( - "download the reference files \ + Description = "download the reference files \ (fna, faa, gff)from the reference NCBI file." - ) Epilog = """Usage example: \ python download_reference.py \ -file \ @@ -66,17 +65,13 @@ def parse_args(args=None): parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument( - "-file", - help="File containing the ranking of references from kmerfinder." + "-file", help="File containing the ranking of references from kmerfinder." 
) parser.add_argument( "-reference", - help="File containing the paths to bacterial references. See example in: https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt" - ) - parser.add_argument( - "-out_dir", - help="Output directory." + help="File containing the paths to bacterial references. See example in: https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt", ) + parser.add_argument("-out_dir", help="Output directory.") return parser.parse_args(args) @@ -98,10 +93,9 @@ def download_references(file, reference, out_dir): ] top_reference = infile[0][0] - with open(str(top_reference) + ".winner", 'w') as topref: + with open(str(top_reference) + ".winner", "w") as topref: topref.write(top_reference) - # create the outdir (do nothing if already there) try: os.mkdir(out_dir) @@ -134,7 +128,11 @@ def download_references(file, reference, out_dir): dir_url.append(assembly_url) if len(dir_url) == 0: - print("No assemblies responding to the top reference: ", top_reference, " were found") + print( + "No assemblies responding to the top reference: ", + top_reference, + " were found", + ) sys.exit(1) dir_url = str(dir_url[0]) @@ -145,9 +143,9 @@ def download_references(file, reference, out_dir): file_url = dir_url + r_end print(file_url) - #wget.download(file_url, out_file) + # wget.download(file_url, out_file) response = requests.get(file_url, stream=True) - with open(out_file, 'wb') as out: + with open(out_file, "wb") as out: for chunk in response.iter_content(chunk_size=8192): out.write(chunk) diff --git a/bin/kmerfinder_summary.py b/bin/kmerfinder_summary.py index 612a2525..5d9fb513 100755 --- a/bin/kmerfinder_summary.py +++ b/bin/kmerfinder_summary.py @@ -15,7 +15,6 @@ def check_arg(args=None): - """ Description: Function collect arguments from command line using argparse @@ -32,14 +31,14 @@ def check_arg(args=None): parser = argparse.ArgumentParser( prog="07-kmerfinder.py", 
formatter_class=argparse.RawDescriptionHelpFormatter, - description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME + description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME ) parser.add_argument( "--path", "-p", required=True, - help="Insert path of results.txt file like /home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME + help="Insert path of results.txt file like /home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME ) parser.add_argument( @@ -61,7 +60,6 @@ def check_arg(args=None): def kmerfinder_dictionary(file_txt): - """ Description: Function to extract the relevant part of result.txt file @@ -71,7 +69,7 @@ def kmerfinder_dictionary(file_txt): dictionary """ - step = "07-kmerfinder_" # FIXME + step = "07-kmerfinder_" # FIXME num_lines = sum(1 for line in open(file_txt)) hits = num_lines - 1 # to count the total number of hits @@ -110,7 +108,6 @@ def kmerfinder_dictionary(file_txt): def dictionary2bn(dictionary, binary_file): - """ Description: @@ -134,7 +131,6 @@ def dictionary2bn(dictionary, binary_file): def dictionary2csv(dictionary, csv_file): - """ Description: @@ -185,7 +181,7 @@ def dictionary2csv(dictionary, csv_file): kmer_all = {} for sample in sample_list: - file_name = os.path.join(path, sample + "_results.txt" ) + file_name = os.path.join(path, sample + "_results.txt") kmer_all[sample] = kmerfinder_dictionary(file_name) print("kmerfinder_dictionary done") diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py index 391ca41e..3838ab5f 100755 --- a/bin/multiqc_to_custom_csv.py +++ b/bin/multiqc_to_custom_csv.py @@ -9,9 +9,7 @@ def parse_args(args=None): - Description = ( - "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline." - ) + Description = "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline." 
Epilog = "Example usage: python multiqc_to_custom_tsv.py" parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument( @@ -61,7 +59,9 @@ def find_tag(d, tag): yield i -def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[]): +def yaml_fields_to_dict( + yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[] +): integer_fields = [ "# contigs", "# contigs (>= 5000 bp)", @@ -83,7 +83,9 @@ def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_ val = list(find_tag(yaml_dict[k], j[0])) ## Fix for Cutadapt reporting reads/pairs as separate values if j[0] == "r_written" and len(val) == 0: - val = [list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2] + val = [ + list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2 + ] if len(val) != 0: val = val[0] if len(j) == 2: @@ -121,7 +123,9 @@ def yaml_fields_to_dict(yaml_file, append_dict={}, field_mapping_list=[], valid_ return append_dict -def metrics_dict_to_file(file_field_list, multiqc_data_dir, out_file, valid_sample_list=[]): +def metrics_dict_to_file( + file_field_list, multiqc_data_dir, out_file, valid_sample_list=[] +): metrics_dict = {} field_list = [] for yaml_file, mapping_list in file_field_list: @@ -164,7 +168,7 @@ def main(args=None): [ ("# Input reads", ["before_filtering", "total_reads"]), ("# Trimmed reads (fastp)", ["after_filtering", "total_reads"]), - ] + ], ), ( "multiqc_quast.yaml", @@ -179,14 +183,26 @@ def main(args=None): "multiqc_kmerfinder.yaml", [ ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), - ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), - ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), ("# Best hit depth 
(Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), - ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), - ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), - ] + ], ), ] @@ -197,7 +213,7 @@ def main(args=None): ("# Input reads", ["Number of reads_fastq"]), ("# Median read lenght", ["Median read length_fastq"]), ("# Median read quality", ["Median read quality_fastq"]), - ] + ], ), ( "multiqc_quast.yaml", @@ -212,14 +228,26 @@ def main(args=None): "multiqc_kmerfinder.yaml", [ ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), - ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), - ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), - ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), - ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), - ] + ], ), ] @@ -229,7 +257,7 @@ def main(args=None): [ ("# Input short reads", 
["before_filtering", "total_reads"]), ("# Trimmed short reads (fastp)", ["after_filtering", "total_reads"]), - ] + ], ), ( "multiqc_nanostat.yaml", @@ -237,7 +265,7 @@ def main(args=None): ("# Input long reads", ["Number of reads_fastq"]), ("# Median long reads lenght", ["Median read length_fastq"]), ("# Median long reads quality", ["Median read quality_fastq"]), - ] + ], ), ( "multiqc_quast.yaml", @@ -252,33 +280,45 @@ def main(args=None): "multiqc_kmerfinder.yaml", [ ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), - ("# Best hit assembly ID (Kmerfinder)", ["07-kmerfinder_best_hit_# Assembly"]), - ("# Best hit query coverage (Kmerfinder)", ["07-kmerfinder_best_hit_Query_Coverage"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), - ("# Second hit assembly ID (Kmerfinder)", ["07-kmerfinder_second_hit_# Assembly"]), - ("# Second hit query coverage (Kmerfinder)", ["07-kmerfinder_second_hit_Query_Coverage"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), - ] + ], ), ] ## Write de novo assembly metrics to file - if args.ASSEMBLY_TYPE == 'short': + if args.ASSEMBLY_TYPE == "short": metrics_dict_to_file( file_field_list=illumina_assembly_files, multiqc_data_dir=args.MULTIQC_DATA_DIR, out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", valid_sample_list=[], ) - elif args.ASSEMBLY_TYPE == 'long': + elif args.ASSEMBLY_TYPE == "long": metrics_dict_to_file( file_field_list=nanopore_assembly_files, multiqc_data_dir=args.MULTIQC_DATA_DIR, out_file=args.OUT_PREFIX 
+ "_assembly_metrics_mqc.csv", valid_sample_list=[], ) - elif args.ASSEMBLY_TYPE == 'hybrid': + elif args.ASSEMBLY_TYPE == "hybrid": metrics_dict_to_file( file_field_list=hybrid_assembly_files, multiqc_data_dir=args.MULTIQC_DATA_DIR, @@ -286,5 +326,6 @@ def main(args=None): valid_sample_list=[], ) + if __name__ == "__main__": sys.exit(main()) diff --git a/docs/output.md b/docs/output.md index f35f7690..ca8a3e47 100644 --- a/docs/output.md +++ b/docs/output.md @@ -126,6 +126,7 @@ The pipeline includes a dedicated step for short and long reads QC as well as co Output files - `Kmerfinder/{ID}/` + - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information. - `Kmerfinder/`: @@ -196,6 +197,7 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo Output files - `QUAST/report/` + - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf From 0410564e6bf9a3689514cda92f2e9cd6015e1fdc Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 20 May 2024 11:34:39 +0200 Subject: [PATCH 54/58] update CHANGELOG in #135 --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d48b275..c5ea8246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` +- [#135](https://github.com/nf-core/bacass/pull/135) Replaced nf-core MultiQC module with a custom MultiQC module. + ### `Added` +- [#135](https://github.com/nf-core/bacass/pull/135) Implementation of KmerFinder subworkflow, Custom Quast, and Custom MultiQC Reports: + + - Added KmerFinder subworkflow for read quality control, purity assessment, and sample grouping based on reference genome estimation. + - Enhanced Quast Assembly QC to run both general and reference genome-based analyses when KmerFinder is invoked. 
+ - Implemented custom MultiQC module with multiqc_config.yml files for different assembly modes (short, long, hybrid). + - Generated custom MultiQC HTML report consolidating metrics from KmerFinder, Quast, and other relevant sources. + - [#133](https://github.com/nf-core/bacass/pull/133) Update nf-core/bacass to the new nf-core 2.14.1 `TEMPLATE`. ### `Fixed` From 8967e55a3608d3d75259444dc35859ca8454e311 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Thu, 23 May 2024 15:11:57 +0200 Subject: [PATCH 55/58] add reviewer suggestions #135 pt.1 --- README.md | 4 +- conf/modules.config | 20 +++++---- conf/test_full.config | 3 +- conf/test_hybrid_dragonflye.config | 6 +-- docs/output.md | 9 +--- modules/local/kmerfinder.nf | 8 ++-- nextflow.config | 2 +- nextflow_schema.json | 2 +- subworkflows/local/kmerfinder_subworkflow.nf | 34 ++++++++++----- .../utils_nfcore_bacass_pipeline/main.nf | 26 +++++++++++- workflows/bacass.nf | 42 +++---------------- 11 files changed, 79 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 2996e355..589004f5 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,10 @@ For users specifying both short read and long read (NanoPore) data, the pipeline In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core). -In specific cases where samples recorded in the input samplesheet belong to more than one species, the pipeline finds and downloads their respectve reference genomes (this also works with single specie input samplesheet). It then groups the samples into batches and collects assembly QC results based on their corresponding reference genomes. 
+If Kmerfinder is invoked, the pipeline will group samples according to the [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/)-estimated reference genomes. Afterwards, two QUAST steps will be carried out: an initial ('general') [QUAST](http://bioinf.spbau.ru/quast) of all samples without reference genomes, and subsequently, a 'by reference genome' [QUAST](http://bioinf.spbau.ru/quast) to aggregate samples with their reference genomes. > NOTE: This scenario is supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed only. -In cases where input samplesheet has files where , the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST containing all the input samples and a by reference genome QUAST report, that is, a quast report for each reference genome. - ## Usage > [!NOTE] diff --git a/conf/modules.config b/conf/modules.config index 5cb3bd5f..78bd92bb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,15 +31,21 @@ process { ] } - // FIXME: SAVE TRIMMED NOT WORKING withName: 'PORECHOP_PORECHOP' { ext.args = '' ext.prefix = { "${meta.id}.porechop" } publishDir = [ - path: { "${params.outdir}/trimming/longreads" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enable: params.save_trimmed + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.fastq.gz", + mode: params.publish_dir_mode, + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.log", + mode: params.publish_dir_mode, + ] ] } @@ -178,13 +184,11 @@ process { ] } - // FIXME: output structure and meta updated. 
It might require a fixme withName: 'QUAST|QUAST_BYREFSEQID' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - //pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}", saveAs: { filename -> if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ null @@ -217,7 +221,7 @@ process { ] } - withName: 'MULTIQC' { + withName: 'MULTIQC_CUSTOM' { ext.args = '-k yaml' publishDir = [ path: { "${params.outdir}/multiqc" }, diff --git a/conf/test_full.config b/conf/test_full.config index 1af8ca49..669ae228 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,7 +17,6 @@ params { // Input data for full size test input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv' kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' - kmerfinderdb = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz?download=1' + kmerfinderdb = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz' ncbi_assembly_metadata = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt' - } diff --git a/conf/test_hybrid_dragonflye.config b/conf/test_hybrid_dragonflye.config index d23c8ea1..9cf34364 100644 --- a/conf/test_hybrid_dragonflye.config +++ b/conf/test_hybrid_dragonflye.config @@ -23,9 +23,9 @@ params { input = params.pipelines_testdata_base_path + 'bacass/bacass_hybrid_dragonflye.tsv' // some extra args to speed tests up - assembly_type ='hybrid' - assembler ='dragonflye' - prokka_args =" --fast" + assembly_type = 'hybrid' + assembler = 'dragonflye' + prokka_args = " --fast" skip_kraken2 = true skip_kmerfinder = true } diff --git a/docs/output.md b/docs/output.md index ca8a3e47..500aa6d4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -126,11 +126,9 @@ The pipeline includes a dedicated step for short and long reads QC as well as co Output files - 
`Kmerfinder/{ID}/` - - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information. - -- `Kmerfinder/`: - - kmerfinder_summary.csv: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder. +- `Kmerfinder/` + - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder. @@ -197,12 +195,9 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo Output files - `QUAST/report/` - - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf - -- `QUAST/runs_per_reference/{reference_assembly}/` - `icarus.html`: QUAST's contig browser as HTML - `report.html`: QUAST assembly QC as HTML report - `report.pdf`: QUAST assembly QC as pdf diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf index 92aff76d..cca5f359 100644 --- a/modules/local/kmerfinder.nf +++ b/modules/local/kmerfinder.nf @@ -8,8 +8,7 @@ process KMERFINDER { 'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }" input: - tuple val(meta), path(reads) - path(kmerfinder_db) + tuple val(meta), path(reads), path(kmerfinder_db) output: tuple val(meta), path("*_results.txt") , emit: report @@ -19,7 +18,8 @@ process KMERFINDER { script: def prefix = task.ext.prefix ?: "${meta.id}" def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}" - + // WARNING: Ensure to update software version in this line if you modify the container/environment. 
+ def kmerfinder_version = "3.0.2" """ kmerfinder.py \\ --infile $in_reads \\ @@ -33,7 +33,7 @@ process KMERFINDER { cat <<-END_VERSIONS > versions.yml "${task.process}": - kmerfinder: \$(echo "3.0.2") + kmerfinder: \$(echo "${kmerfinder_version}") END_VERSIONS """ } diff --git a/nextflow.config b/nextflow.config index 206891bd..0535ddb3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,7 +73,7 @@ params { validate_params = true schema_ignore_params = 'modules,igenomes_base' version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' // Config options diff --git a/nextflow_schema.json b/nextflow_schema.json index d6337f1c..ce8b1a54 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -95,7 +95,7 @@ }, "ncbi_assembly_metadata": { "type": "string", - "description": "Master file (*.txt) containing a summary of asseblies available in GeneBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" + "description": "Master file (*.txt) containing a summary of assemblies available in GenBank or RefSeq. 
See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" } } }, diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf index bab7ae94..ef777ebc 100644 --- a/subworkflows/local/kmerfinder_subworkflow.nf +++ b/subworkflows/local/kmerfinder_subworkflow.nf @@ -1,36 +1,50 @@ // // Kmerfinder subworkflow for species identification & QC // +include { UNTAR } from '../../modules/nf-core/untar/main' include { KMERFINDER } from '../../modules/local/kmerfinder' include { KMERFINDER_SUMMARY } from '../../modules/local/kmerfinder_summary' include { FIND_DOWNLOAD_REFERENCE } from '../../modules/local/find_download_reference' -include { QUAST } from '../../modules/nf-core/quast/main' workflow KMERFINDER_SUBWORKFLOW { take: - kmerfinder_db // channel: [ path ] - ncbi_assembly_metadata // channel: [ path ] reads // channel: [ meta, reads ] consensus // channel: [ meta, consensus ] main: ch_versions = Channel.empty() + // Prepare kmerfinder database + ch_kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) + ch_ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true) + + if ( ch_kmerfinderdb.name.endsWith('.gz') ) { + UNTAR ( [[ id: ch_kmerfinderdb.getSimpleName() ], ch_kmerfinderdb] ) + ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file } + ch_versions = ch_versions.mix(UNTAR.out.versions) + } else { + ch_kmerfinderdb_untar = Channel.from(params.kmerfinderdb) + } + // MODULE: Kmerfinder, QC for sample purity. Identifies reference specie and reference genome assembly for each sample. 
+ reads + .combine(ch_kmerfinderdb_untar) + .map{ meta, reads, db -> tuple(meta, reads, db) } + .set{ ch_to_kmerfinder } + KMERFINDER ( - reads, - kmerfinder_db + ch_to_kmerfinder ) ch_kmerfinder_report = KMERFINDER.out.report ch_kmerfinder_json = KMERFINDER.out.json - ch_versions = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix(KMERFINDER.out.versions) // MODULE: Kmerfinder summary report. Generates a csv report file collecting all sample references. KMERFINDER_SUMMARY ( ch_kmerfinder_report.map{ meta, report -> report }.collect() ) ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml - ch_versions = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix(KMERFINDER_SUMMARY.out.versions) // SUBWORKFLOW: Create a channel to organize assemblies and reports based on the identified Kmerfinder reference. ch_kmerfinder_json @@ -47,9 +61,9 @@ workflow KMERFINDER_SUBWORKFLOW { // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly. FIND_DOWNLOAD_REFERENCE ( ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, - ncbi_assembly_metadata + ch_ncbi_assembly_metadata ) - ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix(FIND_DOWNLOAD_REFERENCE.out.versions) // Organize sample assemblies into channels based on their corresponding reference files. 
ch_reports_byreference @@ -63,7 +77,7 @@ workflow KMERFINDER_SUBWORKFLOW { .set { ch_consensus_byrefseq } emit: - versions = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ] + versions = ch_versions // channel: [ path(versions.yml) ] summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yml) ] consensus_byrefseq = ch_consensus_byrefseq // channel: [ refmeta, meta, fasta, fna, gff ] } diff --git a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf index 05cae8bc..b707cbcb 100644 --- a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf @@ -75,7 +75,7 @@ workflow PIPELINE_INITIALISATION { // // Custom validation for pipeline parameters // - //validateInputParameters() + validateInputParameters() // // Create channel from input file provided through params.input @@ -156,6 +156,26 @@ workflow PIPELINE_COMPLETION { // def validateInputParameters() { // Add functions here for parameters validation + // Check Kraken2 dependencies + if (!params.skip_kraken2 && !params.kraken2db) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Kraken2 database not provided.\n" + + " Please specify the '--kraken2db' parameter to provide the necessary database.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } + + // Check kmerfinder dependencies + if (!params.skip_kmerfinder) { + if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Kmerfinder database and NCBI assembly metadata not provided.\n" + + " Please specify the '--kmerfinderdb' and '--ncbi_assembly_metadata' parameters.\n" + + " Both are required to run Kmerfinder.\n" + + 
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + error(error_string) + } + } } // @@ -184,7 +204,8 @@ def toolCitationText() { "ProeChop (Wick RR et al. 2017)", "Nanoplot (Wouter De Coster and Rosa Rademakers 2023)", "PycoQC (Adrien Leger & Tommaso Leonardi 2019)", - "Kreken2 (Derrick E. Wood et al. 2019)", + "Kraken2 (Derrick E. Wood et al. 2019)", + "Kmerfinder (Larsen et al. 2014)", "Unicycler (Ryan R Wick et al. 2017)", "Minimap & Miniasm (Heng Li 2016)", "Dragonflye (Robert A Petit III )", @@ -212,6 +233,7 @@ def toolBibliographyText() { "
  • Wouter De Coster, Rosa Rademakers, NanoPack2: population-scale evaluation of long-read sequencing data, Bioinformatics, Volume 39, Issue 5, May 2023, btad311, https://doi.org/10.1093/bioinformatics/btad311
  • ", "
  • Leger et al., (2019). pycoQC, interactive quality control for Oxford Nanopore Sequencing. Journal of Open Source Software, 4(34), 1236, https://doi.org/10.21105/joss.01236
  • ", "
  • Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0
  • ", + "
  • Benchmarking of Methods for Genomic Taxonomy. Larsen MV, Cosentino S, Lukjancenko O, Saputra D, Rasmussen S, Hasman H, Sicheritz-Pontén T, Aarestrup FM, Ussery DW, Lund O. J Clin Microbiol. 2014 Feb 26.
  • ", "
  • Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595.
  • ", "
  • Heng Li, Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences, Bioinformatics, Volume 32, Issue 14, July 2016, Pages 2103–2110, https://doi.org/10.1093/bioinformatics/btw152
  • ", "
  • Petit III, R. A. dragonflye: assemble bacterial isolate genomes from Nanopore reads (Version 1.1.2). https://github.com/rpetit3/dragonflye
  • ", diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 7ad3a74f..6fea1924 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -8,24 +8,6 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check krakendb -if (!params.skip_kraken2) { - if (params.kraken2db) { - kraken2db = file(params.kraken2db, checkIfExists: true) - } else { - exit 1, "Missing Kraken2 DB arg" - } -} - -// Check kmerfinder dependencies -if (!params.skip_kmerfinder) { - if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) { - exit 1, "[KMERFINDER]: Missing --kmerfinder_db and/or --ncbi_assembly_metadata arguments. Both are required to run KMERFINDER." - } else { - kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true) - ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true) - } -} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -89,7 +71,6 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/krake include { QUAST } from '../modules/nf-core/quast/main' include { QUAST as QUAST_BYREFSEQID } from '../modules/nf-core/quast/main' include { GUNZIP } from '../modules/nf-core/gunzip/main' -include { UNTAR } from '../modules/nf-core/untar/main' include { PROKKA } from '../modules/nf-core/prokka/main' // @@ -411,31 +392,20 @@ workflow BACASS { ch_kmerfinder_multiqc = Channel.empty() if (!params.skip_kmerfinder) { - // Prepare kmerfinder database - if ( kmerfinderdb.name.endsWith('.gz') ) { - UNTAR ( [[ id: kmerfinderdb.getSimpleName() ], kmerfinderdb] ) - ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file } - } else { - ch_kmerfinderdb_untar = Channel.from(kmerfinder_db) - } - - // Set kmerfinder input based on assembly type + // Set kmerfinder channel based on assembly type if( params.assembly_type == 'short' || 
params.assembly_type == 'hybrid' ) { ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads } else if ( params.assembly_type == 'long' ) { ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads } - // RUN kmerfinder subworkflow KMERFINDER_SUBWORKFLOW ( - ch_kmerfinderdb_untar, - ncbi_assembly_metadata, ch_for_kmerfinder, ch_assembly ) ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq - ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions) // Set channel to perform by refseq QUAST based on reference genome identified with KMERFINDER. ch_consensus_byrefseq @@ -476,7 +446,7 @@ workflow BACASS { ) ch_quast_multiqc = QUAST_BYREFSEQID.out.results } - ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(QUAST.out.versions) // Check assemblies that require further processing for gene annotation ch_assembly @@ -571,15 +541,15 @@ workflow BACASS { ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), ch_quast_multiqc.collect{it[1]}.ifEmpty([]), - ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]), - ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_prokka_txt_multiqc.collect().ifEmpty([]), + ch_bakta_txt_multiqc.collect().ifEmpty([]), ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) multiqc_report = MULTIQC_CUSTOM.out.report.toList() emit: multiqc_report = MULTIQC_CUSTOM.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + versions = ch_versions // channel: [ path(versions.yml) ] } /* From 7075f3d2be3efa628124123381c22a4a6cdcf4c0 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Thu, 23 May 2024 15:24:16 +0200 Subject: [PATCH 56/58] fix multqc channels --- workflows/bacass.nf | 24 ++++++++---------------- 1 file 
changed, 8 insertions(+), 16 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 6fea1924..314d704f 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -14,16 +14,8 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// Place config files here -// When invoking kmerfinder, utilize a custom MultiQC config file to generate a specialized report. This report will organize samples into groups based on their reference genome, w were previously calculated by kmerfinder. -if (!params.skip_kmerfinder && params.assembly_type) { - ch_multiqc_config = file("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) -} else { - ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -} -ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -519,20 +511,20 @@ workflow BACASS { // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_config = !params.skip_kmerfinder && params.assembly_type ? Channel.fromPath("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) : Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty() summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? Channel.fromPath(params.multiqc_methods_description, checkIfExists: true) : Channel.fromPath("$projectDir/assets/methods_description_template.yml", checkIfExists: true) MULTIQC_CUSTOM ( - ch_multiqc_config, - ch_multiqc_custom_config, - ch_multiqc_logo.collect().ifEmpty([]), + ch_multiqc_config.ifEmpty([]), + ch_multiqc_custom_config.ifEmpty([]), + ch_multiqc_logo.ifEmpty([]), ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), - ch_multiqc_custom_methods_description, - ch_collated_versions, + ch_multiqc_custom_methods_description.ifEmpty([]), + ch_collated_versions.ifEmpty([]), ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([]), From 53629ce18e1a0c66e6716604e266f95ece38b5fa Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Thu, 23 May 2024 15:48:54 +0200 Subject: [PATCH 57/58] add reviewer suggestions #135 pt.2 --- modules/nf-core/quast/main.nf | 4 ++-- modules/nf-core/quast/quast.diff | 23 ----------------------- 2 files changed, 2 insertions(+), 25 deletions(-) delete mode 100644 modules/nf-core/quast/quast.diff diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf index 68f06851..d8f36284 100644 --- a/modules/nf-core/quast/main.nf +++ b/modules/nf-core/quast/main.nf @@ -14,7 +14,7 @@ 
process QUAST { output: tuple val(meta), path("${prefix}") , emit: results - tuple val(meta), path("report.tsv") , emit: tsv + tuple val(meta), path("${prefix}.tsv") , emit: tsv tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned @@ -37,7 +37,7 @@ process QUAST { $args \\ ${consensus.join(' ')} - ln -s ${prefix}/report.tsv report.tsv + ln -s ${prefix}/report.tsv ${prefix}.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff deleted file mode 100644 index d267a2c9..00000000 --- a/modules/nf-core/quast/quast.diff +++ /dev/null @@ -1,23 +0,0 @@ -Changes in module 'nf-core/quast' ---- modules/nf-core/quast/main.nf -+++ modules/nf-core/quast/main.nf -@@ -14,7 +14,7 @@ - - output: - tuple val(meta), path("${prefix}") , emit: results -- tuple val(meta), path("${prefix}.tsv") , emit: tsv -+ tuple val(meta), path("report.tsv") , emit: tsv - tuple val(meta), path("${prefix}_transcriptome.tsv") , optional: true , emit: transcriptome - tuple val(meta), path("${prefix}_misassemblies.tsv") , optional: true , emit: misassemblies - tuple val(meta), path("${prefix}_unaligned.tsv") , optional: true , emit: unaligned -@@ -37,7 +37,7 @@ - $args \\ - ${consensus.join(' ')} - -- ln -s ${prefix}/report.tsv ${prefix}.tsv -+ ln -s ${prefix}/report.tsv report.tsv - [ -f 
${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv - [ -f ${prefix}/contigs_reports/misassemblies_report.tsv ] && ln -s ${prefix}/contigs_reports/misassemblies_report.tsv ${prefix}_misassemblies.tsv - [ -f ${prefix}/contigs_reports/unaligned_report.tsv ] && ln -s ${prefix}/contigs_reports/unaligned_report.tsv ${prefix}_unaligned.tsv - -************************************************************ From 7b66a5e8d84312322cef434bf64047b15697f8fb Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Thu, 23 May 2024 16:51:04 +0200 Subject: [PATCH 58/58] fix test_long_miniasm git CI test in #135 --- workflows/bacass.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 314d704f..07b60b0b 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -348,7 +348,7 @@ workflow BACASS { ch_kraken_long_multiqc = Channel.empty() if ( !params.skip_kraken2 ) { KRAKEN2_DB_PREPARATION ( - kraken2db + params.kraken2db ) ch_versions = ch_versions.mix(KRAKEN2_DB_PREPARATION.out.versions) KRAKEN2 (