diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml index 32849b2e..cc2f2a19 100644 --- a/.github/workflows/sanger_test.yml +++ b/.github/workflows/sanger_test.yml @@ -23,6 +23,7 @@ jobs: parameters: | { "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", + "use_work_dir_as_temp": true, } profiles: test,sanger,singularity,cleanup - uses: actions/upload-artifact@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index bf3e3e6a..050399ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,41 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13] + +The pipeline has now been validated for draft (unpublished) assemblies. + +- The pipeline now queries the NCBI database instead of GoaT to establish the + taxonomic classification of the species and the relevant Busco lineages. + In case the taxon_id is not found, the pipeline falls back to GoaT, which + is aware of upcoming taxon_ids in ENA. +- New `--busco_lineages` parameter to choose specific Busco lineages instead of + automatically selecting based on the taxonomy. +- All parameters are now passed the regular Nextflow way. There is no support + for the original Yaml configuration files of the Snakemake version. +- New option `--skip_taxon_filtering` to skip the taxon filtering in blast searches. + Mostly relevant for draft assemblies. +- Introduced the `--use_work_dir_as_temp` parameter to avoid leaving files in `/tmp`. + +### Parameters + +| Old parameter | New parameter | +| ------------- | ---------------------- | +| --yaml | | +| | --busco_lineages | +| | --skip_taxon_filtering | +| | --use_work_dir_as_temp | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| goat | 0.2.5 | | + ## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22] ### Enhancements & fixes diff --git a/README.md b/README.md index c7b92970..b55ab353 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) 2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk)) -3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) +3. Determine the appropriate BUSCO lineages from the taxonomy. 4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. 
Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) diff --git a/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt new file mode 100644 index 00000000..7105beeb --- /dev/null +++ b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt @@ -0,0 +1,166 @@ +422676 aconoidasida +7898 actinopterygii +5338 agaricales +155619 agaricomycetes +33630 alveolata +5794 apicomplexa +6854 arachnida +6656 arthropoda +4890 ascomycota +8782 aves +5204 basidiomycota +68889 boletales +3699 brassicales +134362 capnodiales +33554 carnivora +91561 cetartiodactyla +34395 chaetothyriales +3041 chlorophyta +5796 coccidia +28738 cyprinodontiformes +7147 diptera +147541 dothideomycetes +3193 embryophyta +33392 endopterygota +314146 euarchontoglires +33682 euglenozoa +2759 eukaryota +5042 eurotiales +147545 eurotiomycetes +9347 eutheria +72025 fabales +4751 fungi +314147 glires +1028384 glomerellales +5178 helotiales +7524 hemiptera +7399 hymenoptera +5125 hypocreales +50557 insecta +314145 laurasiatheria +147548 leotiomycetes +7088 lepidoptera +4447 liliopsida +40674 mammalia +33208 metazoa +6029 microsporidia +6447 mollusca +4827 mucorales +1913637 mucoromycota +6231 nematoda +33183 onygenales +9126 passeriformes +5820 plasmodium +92860 pleosporales +38820 poales +5303 polyporales +9443 primates +4891 saccharomycetes +8457 sauropsida +4069 solanales +147550 sordariomycetes +33634 stramenopiles +32523 tetrapoda +155616 tremellomycetes +7742 vertebrata +33090 viridiplantae +71240 eudicots +57723 acidobacteria +201174 actinobacteria_phylum +1760 actinobacteria_class +28211 alphaproteobacteria +135622 alteromonadales +200783 aquificae +1385 bacillales +91061 bacilli +2 bacteria +171549 bacteroidales +976 bacteroidetes +68336 bacteroidetes-chlorobi_group +200643 bacteroidia +28216 betaproteobacteria +80840 burkholderiales +213849 campylobacterales +1706369 cellvibrionales 
+204428 chlamydiae +1090 chlorobi +200795 chloroflexi +135613 chromatiales +1118 chroococcales +186801 clostridia +186802 clostridiales +84999 coriobacteriales +84998 coriobacteriia +85007 corynebacteriales +1117 cyanobacteria +768507 cytophagales +768503 cytophagia +68525 delta-epsilon-subdivisions +28221 deltaproteobacteria +213118 desulfobacterales +213115 desulfovibrionales +69541 desulfuromonadales +91347 enterobacterales +186328 entomoplasmatales +29547 epsilonproteobacteria +1239 firmicutes +200644 flavobacteriales +117743 flavobacteriia +32066 fusobacteria +203491 fusobacteriales +1236 gammaproteobacteria +186826 lactobacillales +118969 legionellales +85006 micrococcales +31969 mollicutes +2085 mycoplasmatales +206351 neisseriales +32003 nitrosomonadales +1161 nostocales +135619 oceanospirillales +1150 oscillatoriales +135625 pasteurellales +203682 planctomycetes +85009 propionibacteriales +1224 proteobacteria +72274 pseudomonadales +356 rhizobiales +227290 rhizobium-agrobacterium_group +204455 rhodobacterales +204441 rhodospirillales +766 rickettsiales +909929 selenomonadales +117747 sphingobacteriia +204457 sphingomonadales +136 spirochaetales +203691 spirochaetes +203692 spirochaetia +85011 streptomycetales +85012 streptosporangiales +1890424 synechococcales +508458 synergistetes +544448 tenericutes +68295 thermoanaerobacterales +200918 thermotogae +72273 thiotrichales +1737405 tissierellales +1737404 tissierellia +74201 verrucomicrobia +135623 vibrionales +135614 xanthomonadales +2157 archaea +2266 thermoproteales +2281 sulfolobales +114380 desulfurococcales +183967 thermoplasmata +651137 thaumarchaeota +2182 methanococcales +2191 methanomicrobiales +183925 methanobacteria +183924 thermoprotei +2235 halobacteriales +1644060 natrialbales +224756 methanomicrobia +1644055 haloferacales +183963 halobacteria +28890 euryarchaeota diff --git a/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 b/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 new file mode 
100644 index 00000000..dc933c1f Binary files /dev/null and b/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 b/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 new file mode 100644 index 00000000..2a56a82f Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 differ diff --git a/bin/generate_config.py b/bin/generate_config.py new file mode 100755 index 00000000..e3d00dfa --- /dev/null +++ b/bin/generate_config.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 + +import argparse +import dataclasses +import os +import sys +import sqlite3 +import requests +import typing +import yaml + +NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s" +GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi" +GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi" +NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies" + +RANKS = [ + "genus", + "family", + "order", + "class", + "phylum", + "kingdom", + "superkingdom", +] + +BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"] + + +def parse_args(args=None): + Description = "Produce the various configuration files needed within the pipeline" + + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("--fasta", required=True, help="Path to the Fasta file of the assembly.") + parser.add_argument("--taxon_query", required=True, help="Query string/integer for this taxon.") + parser.add_argument("--lineage_tax_ids", required=True, help="Mapping between BUSCO lineages and taxon IDs.") + parser.add_argument("--yml_out", required=True, help="Output YML file.") + parser.add_argument("--csv_out", required=True, help="Output CSV file.") + 
parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) + parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) + parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") + parser.add_argument("--version", action="version", version="%(prog)s 1.4") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +@dataclasses.dataclass +class TaxonInfo: + taxon_id: int + organism_name: str + rank: typing.Optional[str] + lineage: typing.List["TaxonInfo"] + + +def make_taxon_info_from_goat(body: dict[str, str]) -> TaxonInfo: + rank = body["taxon_rank"] + if rank == "no rank": + rank = None + if "lineage" in body: + lineage = [make_taxon_info_from_goat(b) for b in body["lineage"]] + else: + lineage = [] + return TaxonInfo(int(body["taxon_id"]), body["scientific_name"], rank, lineage) + + +def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: + if isinstance(taxon_name, int): + taxon_id = taxon_name + record_id = "taxon-%d" % taxon_name + else: + # Resolve the taxon_id of the species + response = requests.get(GOAT_LOOKUP_API % taxon_name).json() + taxon_id = int(response["results"][0]["result"]["taxon_id"]) + record_id = response["results"][0]["id"] + + # Using API, get the taxon_ids of the species and all parents + response = requests.get(GOAT_RECORD_API % record_id).json() + body = response["records"][0]["record"] + return make_taxon_info_from_goat(body) + + +def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]: + # Using API, get the taxon_ids of the species and all parents + response = requests.get(NCBI_TAXONOMY_API % taxon_name).json() + if "taxonomy" in response["taxonomy_nodes"][0]: + body = response["taxonomy_nodes"][0]["taxonomy"] + if with_lineage: + lineage = [ + fetch_taxon_info_from_ncbi(t, with_lineage=False) for t 
in reversed(body["lineage"][2:]) + ] # skip root and cellular_organisms + else: + lineage = [] + return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), lineage) + + +def fetch_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo: + return fetch_taxon_info_from_ncbi(taxon_name) or fetch_taxon_info_from_goat(taxon_name) + + +def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]: + ancestors: typing.Dict[str, str] = {} + for anc_taxon_info in taxon_info.lineage: + if anc_taxon_info.rank: + ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name + # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/ + # "superkingdom" will be called "domain" + if "superkingdom" not in ancestors: + ancestors["superkingdom"] = ancestors["domain"] + return {r: ancestors[r] for r in RANKS if r in ancestors} + + +def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]: + # Read the mapping between the BUSCO lineages and their taxon_id + with open(lineage_tax_ids) as file_in: + lineage_tax_ids_dict: typing.Dict[int, str] = {} + for line in file_in: + arr = line.split() + lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10" + + if requested_buscos: + odb_arr = requested_buscos.split(",") + valid_odbs = set(lineage_tax_ids_dict.values()) + for odb in odb_arr: + if odb not in valid_odbs: + print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr) + sys.exit(1) + else: + # Do the intersection to find the ancestors that have a BUSCO lineage + odb_arr = [ + lineage_tax_ids_dict[anc_taxon_info.taxon_id] + for anc_taxon_info in taxon_info.lineage + if anc_taxon_info.taxon_id in lineage_tax_ids_dict + ] + + return odb_arr + + +def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]: + response = requests.get(NCBI_DATASETS_API % accession).json() + if response["total_count"] != 1: + print(f"Assembly not found: {accession}", 
file=sys.stderr) + sys.exit(1) + assembly_report = response["reports"][0] + assembly_info = assembly_report["assembly_info"] + scaffold_count = assembly_report["assembly_stats"]["number_of_component_sequences"] + scaffold_count += assembly_report["assembly_stats"].get("number_of_organelles", 0) + span = int(assembly_report["assembly_stats"]["total_sequence_length"]) + span += sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])) + d = { + "accession": accession, + "alias": assembly_info["assembly_name"], + "bioproject": assembly_info["bioproject_accession"], + "level": assembly_info["assembly_level"].lower(), + "scaffold-count": scaffold_count, + "span": span, + } + if "biosample" in assembly_info: + d["biosample"] = assembly_info["biosample"]["accession"] + if "wgs_info" in assembly_report: + d["prefix"] = assembly_report["wgs_info"]["wgs_project_accession"] + return d + + +def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: + con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) + cur = con.cursor() + for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]: + res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,)) + if res.fetchone(): + return taxon_id + + +def print_yaml( + file_out, + assembly_info: typing.Dict[str, typing.Union[str, int]], + taxon_info: TaxonInfo, + classification: typing.Dict[str, str], + odb_arr: typing.List[str], +): + data = { + "assembly": assembly_info, + "busco": { + "basal_lineages": BUSCO_BASAL_LINEAGES, + # "download_dir": + "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], + }, + # "reads": {}, + "revision": 1, + "settings": { + # Only settings.stats_windows is mandatory, everything else is superfluous + "blast_chunk": 100000, + "blast_max_chunks": 10, + "blast_min_length": 1000, + "blast_overlap": 0, + "stats_chunk": 1000, + "stats_windows": [0.1, 0.01, 100000, 1000000], + 
# "taxdump": , + "tmp": "/tmp", + }, + "similarity": { + # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous + "blastn": { + "name": "nt", + # "path": , + }, + "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"}, + "diamond_blastp": { + "import_max_target_seqs": 100000, + "name": "reference_proteomes", + # "path": , + "taxrule": "blastp=buscogenes", + }, + "diamond_blastx": { + "name": "reference_proteomes", + # "path": , + }, + }, + "taxon": { + "taxid": str(taxon_info.taxon_id), # str on purpose, to mimic blobtoolkit + "name": taxon_info.organism_name, + **classification, + }, + "version": 2, + } + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + + with open(file_out, "w") as fout: + yaml.dump(data, fout) + + +def print_csv(file_out, taxon_id: int, odb_arr: typing.List[str]): + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + + with open(file_out, "w") as fout: + print("taxon_id", taxon_id, sep=",", file=fout) + for odb_val in odb_arr: + print("busco_lineage", odb_val, sep=",", file=fout) + + +def main(args=None): + args = parse_args(args) + + assembly_info: typing.Dict[str, typing.Union[str, int]] + if args.accession: + assembly_info = get_assembly_info(args.accession) + else: + assembly_info = {"level": "scaffold"} + assembly_info["file"] = args.fasta + + taxon_info = fetch_taxon_info(args.taxon_query) + classification = get_classification(taxon_info) + odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) + taxon_id = adjust_taxon_id(args.blastn, taxon_info) + + print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr) + print_csv(args.csv_out, taxon_id, odb_arr) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/update_versions.py b/bin/update_versions.py index 9e014b46..1d3491a3 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,15 +15,33 @@ def parse_args(args=None): 
parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) - parser.add_argument("--version", action="version", version="%(prog)s 1.1.0") + parser.add_argument("--read_id", action="append", help="ID of a read set") + parser.add_argument("--read_type", action="append", help="Type of a read set") + parser.add_argument("--read_path", action="append", help="Path of a read set") + parser.add_argument("--blastp", help="Path to the blastp database", required=True) + parser.add_argument("--blastx", help="Path to the blastx database", required=True) + parser.add_argument("--blastn", help="Path to the blastn database", required=True) + parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.2.0") return parser.parse_args(args) -def update_meta(meta, software): - with open(meta) as fh: +def datatype_to_platform(s): + if s == "ont": + return "OXFORD_NANOPORE" + elif s.startswith("pacbio"): + return "PACBIO_SMRT" + elif s in ["hic", "illumina"]: + return "ILLUMINA" + else: + return "OTHER" + + +def update_meta(args): + with open(args.meta_in) as fh: infile = json.load(fh) - with open(software) as fh: + with open(args.software) as fh: versions = yaml.safe_load(fh) new_dict = dict() @@ -36,13 +54,27 @@ def update_meta(meta, software): del new_dict["sanger-tol/blobtoolkit"] infile["settings"]["software_versions"] = new_dict + infile["settings"]["taxdump"] = args.taxdump + for k in ["blastn", "diamond_blastp", "diamond_blastx"]: + infile["similarity"].setdefault(k, {}) + infile["similarity"]["blastn"]["path"] = args.blastn + infile["similarity"]["diamond_blastp"]["path"] = args.blastp + infile["similarity"]["diamond_blastx"]["path"] = args.blastx + + infile["reads"] = {} + for id, datatype, path in zip(args.read_id, 
args.read_type, args.read_path): + infile["reads"][id] = { + "file": path, + "plaform": datatype_to_platform(datatype), + } + return infile def main(args=None): args = parse_args(args) - data = update_meta(args.meta_in, args.software) + data = update_meta(args) with open(args.meta_out, "w") as fh: json.dump(data, fh) diff --git a/conf/modules.config b/conf/modules.config index ac597dc4..1193cf5f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -64,10 +64,6 @@ process { ext.args = "-c" } - withName: "GOAT_TAXONSEARCH" { - ext.args = "--lineage --busco" - } - withName: "PIGZ_COMPRESS" { publishDir = [ path: { "${params.outdir}/base_content" }, @@ -85,7 +81,8 @@ process { } withName: "BUSCO" { - scratch = true + // Obey "use_work_dir_as_temp", except for large genomes + scratch = { !params.use_work_dir_as_temp || (meta.genome_size < 2000000000) } ext.args = { 'test' in workflow.profile.tokenize(',') ? // Additional configuration to speed processes up during testing. // Note: BUSCO *must* see the double-quotes around the parameters diff --git a/conf/test.config b/conf/test.config index 623cf3f9..8ad70cae 100644 --- a/conf/test.config +++ b/conf/test.config @@ -35,4 +35,7 @@ params { blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" blastn = "${projectDir}/assets/test/nt_mMelMel3.1" + + // Need to be set to avoid overfilling /tmp + use_work_dir_as_temp = true } diff --git a/conf/test_full.config b/conf/test_full.config index 6af9eecb..ca78130e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -30,4 +30,7 @@ params { blastp = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd" blastn = "${projectDir}/assets/test_full/nt_gfLaeSulp1.1" + + // Need to be set to avoid overfilling /tmp + use_work_dir_as_temp = true } diff --git a/conf/test_raw.config b/conf/test_raw.config index 
47cc4267..0cf1d16f 100644 --- a/conf/test_raw.config +++ b/conf/test_raw.config @@ -36,4 +36,7 @@ params { blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" blastn = "${projectDir}/assets/test/nt_mMelMel3.1/" + + // Need to be set to avoid overfilling /tmp + use_work_dir_as_temp = true } diff --git a/docs/usage.md b/docs/usage.md index 4789ff84..4d0ad33e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -162,12 +162,6 @@ If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, yo find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}" ``` -## YAML File and Nextflow configuration - -As in the Snakemake version [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate metadata summary. This YAML config file can be generated with a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X) or can be passed for draft assemblies (for example, [GCA_922984935.2.yaml](assets/test/GCA_922984935.2.yaml) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter. - -The data in the YAML is currently ignored in the Nextflow pipeline version. The YAML file is retained only to allow compatibility with the BlobDir dataset generated by the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src). The taxonomic information in the YAML file can be obtained from [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/). 
- ## Changes from Snakemake to Nextflow ### Commands @@ -189,9 +183,13 @@ Nextflow nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB # Draft Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB ``` +The Nextflow pipeline does not support taking input from the Yaml files of the Snakemake pipeline. +Instead, Nextflow has a uniform way of setting input parameters on the command-line or via a JSON / Yaml file, +see for some examples. + ### Subworkflows Here is a full list of snakemake subworkflows and their Nextflow couterparts: @@ -235,7 +233,6 @@ List of tools for any given dataset can be fetched from the API, for example htt | busco | 5.3.2 | 5.5.0 | | diamond | 2.0.15 | 2.1.8 | | fasta_windows | | 0.2.4 | -| goat | | 0.2.5 | | minimap2 | 2.24 | 2.24 | | ncbi-datasets-cli | 14.1.0 | | | nextflow | | 23.10.0 | diff --git a/modules.json b/modules.json index ebb45a6c..3ce2a831 100644 --- a/modules.json +++ b/modules.json @@ -45,11 +45,6 @@ "installed_by": ["modules"], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, - "goat/taxonsearch": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf deleted file mode 100644 index 32d4eacd..00000000 --- a/modules/local/blobtoolkit/config.nf +++ /dev/null @@ -1,37 +0,0 @@ -process BLOBTOOLKIT_CONFIG { - tag "$meta.id" - label 
'process_single' - - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." - } - container "docker.io/genomehubs/blobtoolkit:4.3.9" - - input: - tuple val(meta), val(reads) - tuple val(meta), val(fasta) - - output: - tuple val(meta), path("${meta.id}/*.yaml"), emit: yaml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = reads.collect{"--reads $it"}.join(' ') - """ - btk pipeline \\ - generate-config \\ - ${prefix} \\ - $args \\ - ${input_reads} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') - END_VERSIONS - """ -} diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index a5556348..0c3aaaaf 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -9,7 +9,13 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple val(meta), path(input) + val reads path versions + // The following are passed as "val" because we just want to know the full paths. 
No staging necessary + val blastp + val blastx + val blastn + val taxdump output: tuple val(meta), path("*.json"), emit: json @@ -21,11 +27,17 @@ process BLOBTOOLKIT_UPDATEMETA { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" + def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') """ update_versions.py \\ ${args} \\ --meta_in ${input}/meta.json \\ --software ${versions} \\ + --blastp ${blastp} \\ + --blastx ${blastx} \\ + --blastn ${blastn} \\ + --taxdump ${taxdump} \\ + $input_reads \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf new file mode 100644 index 00000000..3aab7ee9 --- /dev/null +++ b/modules/local/generate_config.nf @@ -0,0 +1,44 @@ +process GENERATE_CONFIG { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0" + container "docker.io/genomehubs/blobtoolkit:4.3.9" + + input: + tuple val(meta), val(fasta) + val taxon_query + val busco_lin + path lineage_tax_ids + tuple val(meta2), path(blastn) + + output: + tuple val(meta), path("*.yaml"), emit: yaml + tuple val(meta), path("*.csv") , emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" + def accession_params = params.accession ? 
"--accession ${params.accession}" : "" + """ + generate_config.py \\ + --fasta $fasta \\ + --taxon_query "$taxon_query" \\ + --lineage_tax_ids $lineage_tax_ids \\ + $busco_param \\ + $accession_params \\ + --blastn $blastn \\ + --yml_out ${prefix}.yaml \\ + --csv_out ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_config.py: \$(generate_config.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/goat/taxonsearch/environment.yml b/modules/nf-core/goat/taxonsearch/environment.yml deleted file mode 100644 index e56e71f1..00000000 --- a/modules/nf-core/goat/taxonsearch/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: goat_taxonsearch -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::goat=0.2.5 diff --git a/modules/nf-core/goat/taxonsearch/main.nf b/modules/nf-core/goat/taxonsearch/main.nf deleted file mode 100644 index 62c12baa..00000000 --- a/modules/nf-core/goat/taxonsearch/main.nf +++ /dev/null @@ -1,36 +0,0 @@ -process GOAT_TAXONSEARCH { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/goat:0.2.5--h9d3141d_2': - 'biocontainers/goat:0.2.5--h9d3141d_2' }" - - input: - tuple val(meta), val(taxon), path(taxa_file) - - output: - tuple val(meta), path("*.tsv"), emit: taxonsearch - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - input = taxa_file ? "-f ${taxa_file}" : "-t \"${taxon}\"" - if (!taxon && !taxa_file) error "No input. 
Valid input: single taxon identifier or a .txt file with identifiers" - if (taxon && taxa_file ) error "Only one input is required: a single taxon identifier or a .txt file with identifiers" - """ - goat-cli taxon search \\ - $args \\ - $input > ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - goat: \$(goat-cli --version | cut -d' ' -f2) - END_VERSIONS - """ -} diff --git a/modules/nf-core/goat/taxonsearch/meta.yml b/modules/nf-core/goat/taxonsearch/meta.yml deleted file mode 100644 index 1bb19e30..00000000 --- a/modules/nf-core/goat/taxonsearch/meta.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: "goat_taxonsearch" -description: Query metadata for any taxon across the tree of life. -keywords: - - public datasets - - ncbi - - genomes on a tree -tools: - - goat: - description: | - goat-cli is a command line interface to query the - Genomes on a Tree Open API. - homepage: https://github.com/genomehubs/goat-cli - documentation: https://github.com/genomehubs/goat-cli/wiki - tool_dev_url: https://genomehubs.github.io/goat-cli/goat_cli/ - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test'] - - taxon: - type: string - description: | - The taxon to search. An NCBI taxon ID, or the name of a taxon at any rank. - - taxa_file: - type: file - description: | - A file of NCBI taxonomy ID's (tips) and/or binomial names. Each line - should contain a single entry.File size is limited to 500 entries. - pattern: "*.txt" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - taxonsearch: - type: file - description: TSV file containing search results. 
- pattern: "*.tsv" -authors: - - "@alxndrdiaz" -maintainers: - - "@alxndrdiaz" - - "@muffato" diff --git a/nextflow.config b/nextflow.config index 84bae361..18994df9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,10 +12,10 @@ params { // Specify your pipeline's command line flags // Input options input = null - yaml = null align = false mask = false fetchngs_samplesheet = false + busco_lineages = null // Reference options fasta = null @@ -28,9 +28,15 @@ params { // Databases and related options taxdump = null busco = null + // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz + lineage_tax_ids = "${projectDir}/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt" blastp = null blastx = null blastn = null + skip_taxon_filtering = false + + // Execution options + use_work_dir_as_temp = false // MultiQC options multiqc_config = null @@ -243,7 +249,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.5.1' + version = '0.6.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index b392e2a5..e722369d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,16 +32,21 @@ "description": "Turn on optional genome masking if needed.", "fa_icon": "fas fa-toggle-off" }, + "skip_taxon_filtering": { + "type": "boolean", + "description": "Skip filtering out hits from the same species in blast* searches.", + "fa_icon": "fas fa-toggle-off" + }, "fetchngs_samplesheet": { "type": "boolean", "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.", "fa_icon": "fas fa-toggle-off" }, - "yaml": { + "busco_lineages": { "type": "string", - "format": "file-path", - "description": "Custom config file for draft assembly", - "fa_icon": "fas fa-file-alt" + "pattern": "([^,]+_odb10,)*[^,]+_odb10", + "description": "Name of the ODB10 lineages to run 
BUSCO on. By default, the pipeline will automatically select the correct lineage based on the taxonomy.", + "fa_icon": "fas fa-bone" }, "image_format": { "type": "string", @@ -100,7 +105,7 @@ "type": "object", "fa_icon": "fas fa-database", "description": "Define the location and parameters to work with databases.", - "required": ["blastp", "blastx", "blastn", "taxdump"], + "required": ["blastp", "blastx", "blastn", "lineage_tax_ids", "taxdump"], "properties": { "busco": { "type": "string", @@ -108,6 +113,13 @@ "description": "Local directory where clade-specific BUSCO lineage datasets are stored", "fa_icon": "fas fa-folder-open" }, + "lineage_tax_ids": { + "type": "string", + "format": "file-path", + "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.", + "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz", + "fa_icon": "fas fa-file-code" + }, "blastp": { "type": "string", "format": "file-path", @@ -136,6 +148,21 @@ } } }, + "execution": { + "title": "Execution", + "type": "object", + "description": "Control the execution of the pipeline.", + "default": "", + "properties": { + "use_work_dir_as_temp": { + "type": "boolean", + "description": "Set to true to make tools (e.g. 
sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "fa_icon": "fas fa-arrow-circle-down", + "hidden": true + } + }, + "fa_icon": "fas fa-running" + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -341,6 +368,9 @@ { "$ref": "#/definitions/databases" }, + { + "$ref": "#/definitions/execution" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 2a89471f..4b07723e 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -1,8 +1,7 @@ // -// Run BUSCO for a genome from GOAT and runs diamond_blastp +// Run BUSCO for a genome and runs diamond_blastp // -include { GOAT_TAXONSEARCH } from '../../modules/nf-core/goat/taxonsearch/main' include { BUSCO } from '../../modules/nf-core/busco/main' include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos' include { DIAMOND_BLASTP } from '../../modules/nf-core/diamond/blastp/main' @@ -12,47 +11,24 @@ include { RESTRUCTUREBUSCODIR } from '../../modules/local/restructurebusco workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] - taxon // channel: val(taxon) + busco_lin // channel: val([busco_lineages]) busco_db // channel: path(busco_db) blastp // channel: path(blastp_db) + taxon_id // channel: val(taxon_id) main: ch_versions = Channel.empty() - // - // Fetch BUSCO lineages for taxon - // - GOAT_TAXONSEARCH ( - fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } - ) - ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) - - - // - // Get NCBI species ID - // - GOAT_TAXONSEARCH.out.taxonsearch - | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } - | map { row -> [ row.taxon_rank, row.taxon_id ] } - | transpose() - | filter { rank,id -> rank =~ 
/species/ } - | map { rank, id -> id} - | first - | set { ch_taxid } - - // // Prepare the BUSCO linages // // 0. Initialise sone variables basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] def lineage_position = 0 - // 1. Parse the GOAT_TAXONSEARCH output - GOAT_TAXONSEARCH.out.taxonsearch - | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } - | map { row -> row.odb10_lineage.findAll { it != "" } } + // 1. Start from the taxon's lineages + busco_lin // 2. Add the (missing) basal lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () @@ -117,7 +93,7 @@ workflow BUSCO_DIAMOND { // Hardcoded to match the format expected by blobtools def outext = 'txt' def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, ch_taxid ) + DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, taxon_id ) ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) @@ -148,7 +124,6 @@ workflow BUSCO_DIAMOND { first_table = ch_first_table // channel: [ val(meta), path(full_table) ] all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] - taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index ffbbd534..352cb01b 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -8,6 +8,7 @@ include { COMPRESSBLOBDIR } from '../../modules/local/compressblobdir' workflow FINALISE_BLOBDIR { take: blobdir // channel: [ val(meta), path(blobdir) ] + reads // channel: [ [meta, reads] ] software // channel: [ val(meta), path(software_yml) ] summary // channel: [ val(meta), path(summary_json) ] @@ -18,7 
+19,15 @@ workflow FINALISE_BLOBDIR { // // MODULE: Update meta json file // - BLOBTOOLKIT_UPDATEMETA ( blobdir, software ) + BLOBTOOLKIT_UPDATEMETA ( + blobdir, + reads, + software, + params.blastp, + params.blastx, + params.blastn, + params.taxdump, + ) // // MODULE: Compress all the json files diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d498269f..7a8fd112 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -6,13 +6,16 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' -include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' +include { GENERATE_CONFIG } from '../../modules/local/generate_config' workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv fasta // channel: [ meta, path(fasta) ] - yaml // channel: [ meta, path(config ] + taxon // channel: val(taxon) + busco_lin // channel: val([busco_lin]) + lineage_tax_ids // channel: /path/to/lineage_tax_ids + blastn // channel: [ val(meta), path(blastn_db) ] main: ch_versions = Channel.empty() @@ -58,24 +61,53 @@ workflow INPUT_CHECK { | set { reads } - if ( !params.yaml ) { - read_files - | map { meta, data -> meta.id.split("_")[0..-2].join("_") } - | combine ( fasta ) - | map { sample, meta, fasta -> [ meta, sample ] } - | groupTuple() - | set { grouped_reads } - - BLOBTOOLKIT_CONFIG ( grouped_reads, fasta ) - ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) - ch_config = BLOBTOOLKIT_CONFIG.out.yaml - } else { - ch_config = yaml + GENERATE_CONFIG ( + fasta, + taxon, + busco_lin, + lineage_tax_ids, + blastn, + ) + ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) + + + // + // Parse the CSV file 
+ // + GENERATE_CONFIG.out.csv + | map { meta, csv -> csv } + | splitCsv(header: ['key', 'value']) + | branch { + taxon_id: it.key == "taxon_id" + return it.value + busco_lineage: it.key == "busco_lineage" + return it.value } + | set { ch_parsed_csv } + + + // + // Get the taxon ID if we do taxon filtering in blast* searches + // + ch_parsed_csv.taxon_id + | map { params.skip_taxon_filtering ? '' : it } + | first + | set { ch_taxon_id } + + + // + // Get the BUSCO lineages + // + ch_parsed_csv.busco_lineage + | collect + | set { ch_busco_lineages } + emit: reads // channel: [ val(meta), path(datafile) ] - config = ch_config // channel: [ val(meta), path(yaml) ] + config = GENERATE_CONFIG.out.yaml // channel: [ val(meta), path(yaml) ] + taxon_id = ch_taxon_id // channel: val(taxon_id) + busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index 7dc92fd7..d479e832 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -47,23 +47,37 @@ workflow RUN_BLASTN { | set { ch_chunks } // Run blastn search - // run blastn excluding taxon_id - BLASTN_TAXON ( ch_chunks, blastn, taxon_id ) - ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) - - // check if blastn output table is empty - BLASTN_TAXON.out.txt - | branch { meta, txt -> - empty: txt.isEmpty() - not_empty: true - } - | set { ch_blastn_taxon_out } + if (params.skip_taxon_filtering) { + + // skip BLASTN_TAXON + ch_blast_blastn_input = ch_chunks + + // fake ch_blastn_taxon_out.not_empty + ch_blastn_taxon_out = [ + not_empty: Channel.empty() + ] + + } else { - // repeat the blastn search without excluding taxon_id - ch_blastn_taxon_out.empty - | join ( ch_chunks ) - | map { meta, txt, fasta -> [meta, fasta] } - | set { ch_blast_blastn_input } + // run blastn excluding taxon_id + BLASTN_TAXON ( ch_chunks, blastn, taxon_id ) + 
ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) + + // check if blastn output table is empty + BLASTN_TAXON.out.txt + | branch { meta, txt -> + empty: txt.isEmpty() + not_empty: true + } + | set { ch_blastn_taxon_out } + + // repeat the blastn search without excluding taxon_id + ch_blastn_taxon_out.empty + | join ( ch_chunks ) + | map { meta, txt, fasta -> [meta, fasta] } + | set { ch_blast_blastn_input } + + } BLAST_BLASTN ( ch_blast_blastn_input, blastn, [] ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 3610cdde..cd97fd99 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -17,7 +17,7 @@ WorkflowBlobtoolkit.initialise(params, log) // Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx, params.lineage_tax_ids ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -30,11 +30,11 @@ if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).bas if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' 
} +if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids).first() } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' } + // Create channel for optional parameters +if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) } if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } -if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } -if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' } -if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both mising. Pick one !' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +102,14 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis // - INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome, ch_yaml ) + INPUT_CHECK ( + ch_input, + PREPARE_GENOME.out.genome, + ch_taxon, + ch_busco_lin, + ch_lineage_tax_ids, + ch_blastn, + ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) // @@ -123,13 +130,14 @@ workflow BLOBTOOLKIT { ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) // - // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp + // SUBWORKFLOW: Run BUSCO using lineages fetched from GoaT, then run diamond_blastp // BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, - ch_taxon, + INPUT_CHECK.out.busco_lineages, ch_busco_db, ch_blastp, + INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) @@ -140,7 +148,7 @@ workflow BLOBTOOLKIT { PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, ch_blastx, - BUSCO_DIAMOND.out.taxon_id, + INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -152,7 +160,7 @@ workflow 
BLOBTOOLKIT { RUN_BLASTX.out.blastx_out, PREPARE_GENOME.out.genome, ch_blastn, - BUSCO_DIAMOND.out.taxon_id, + INPUT_CHECK.out.taxon_id, ) // @@ -199,6 +207,7 @@ workflow BLOBTOOLKIT { // FINALISE_BLOBDIR ( BLOBTOOLS.out.blobdir, + INPUT_CHECK.out.reads.collect(flat: false).ifEmpty([]), CUSTOM_DUMPSOFTWAREVERSIONS.out.yml, VIEW.out.summary )