diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 8d12b98a..ca21ecfe 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -19,10 +19,10 @@ jobs: - uses: actions/setup-node@v4 - name: Install editorconfig-checker - run: npm install -g editorconfig-checker + run: npm install -g editorconfig-checker@3.0.2 - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') + run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|.sqlite3') Prettier: runs-on: ubuntu-latest diff --git a/.nf-core.yml b/.nf-core.yml index 85e18745..0ad948cc 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -12,6 +12,9 @@ lint: - LICENCE - lib/NfcoreTemplate.groovy - CODE_OF_CONDUCT.md + - assets/sendmail_template.txt + - assets/email_template.html + - assets/email_template.txt - assets/nf-core-blobtoolkit_logo_light.png - docs/images/nf-core-blobtoolkit_logo_light.png - docs/images/nf-core-blobtoolkit_logo_dark.png @@ -19,6 +22,8 @@ lint: - .github/PULL_REQUEST_TEMPLATE.md - .github/workflows/linting.yml - .github/workflows/branch.yml + - .github/CONTRIBUTING.md + - .github/workflows/linting_comment.yml multiqc_config: - report_comment nextflow_config: diff --git a/CHANGELOG.md b/CHANGELOG.md index 050399ed..23549f6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-10-02] + +The pipeline is now considered to be a complete and suitable replacement for the Snakemake version. + +- Fetch information about the chromosomes of the assemblies. Used to power + "grid plots". +- Fill in accurate read information in the blobDir. Users are now required + to indicate in the samplesheet whether the reads are paired or single. + ## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13] The pipeline has now been validated for draft (unpublished) assemblies. @@ -87,13 +96,13 @@ The pipeline has now been validated on dozens of genomes, up to 11 Gbp. Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. -| Dependency | Old version | New version | -| ----------- | ------------- | ------------- | -| blobtoolkit | 4.3.3 | 4.3.9 | -| blast | 2.14.0 | 2.15.0 | -| multiqc | 1.17 and 1.18 | 1.20 and 1.21 | -| samtools | 1.18 | 1.19.2 | -| seqtk | 1.3 | 1.4 | +| Dependency | Old version | New version | +| ----------- | ------------- | ----------------- | +| blobtoolkit | 4.3.3 | 4.3.9 | +| blast | 2.14.0 | 2.15.0 and 2.14.1 | +| multiqc | 1.17 and 1.18 | 1.20 and 1.21 | +| samtools | 1.18 | 1.19.2 | +| seqtk | 1.3 | 1.4 | > **NB:** Dependency has been **updated** if both old and new version information is present.
**NB:** Dependency has been **added** if just the new version information is present.
**NB:** Dependency has been **removed** if version information isn't present. diff --git a/CITATIONS.md b/CITATIONS.md index 8b960779..a3a1a070 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,10 @@ ## Pipeline tools +- [BLAST+](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) + + > Camacho, Christiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 421, Dec. 2009, https://doi.org/10.1186/1471-2105-10-421. + - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit) > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908. @@ -26,9 +30,7 @@ - [Fasta_windows](https://github.com/tolkit/fasta_windows) -- [GoaT](https://goat.genomehubs.org) - - > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1. + > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows - [Minimap2](https://github.com/lh3/minimap2) @@ -42,6 +44,10 @@ > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008. +- [SeqTK](https://github.com/lh3/seqtk) + + > Li, Heng. "SeqTK v1.4". GitHub, 2023. https://github.com/lh3/seqtk + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index b55ab353..ed74da7c 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) -7. Run BLASTx against sequences with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) -8. Run BLASTn against sequences still with not hit ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond)) +8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) @@ -37,13 +37,17 @@ First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: ```csv -sample,datatype,datafile -mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram -mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram -mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram +sample,datatype,datafile,library_layout +mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED +mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED +mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE ``` -Each row represents an aligned file. 
The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. +Each row represents an aligned file. +Rows with the same sample identifier are considered technical replicates. +The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). +The library layout indicates whether the reads are paired or single. +The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. Now, you can run the pipeline using: @@ -77,9 +81,8 @@ sanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https:/ We thank the following people for their assistance in the development of this pipeline: - - - [Guoying Qi](https://github.com/gq1) +- [Bethan Yates](https://github.com/BethYates) ## Contributions and Support @@ -89,8 +92,6 @@ If you would like to contribute to this pipeline, please see the [contributing g If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). diff --git a/assets/schema_input.json b/assets/schema_input.json index 26ed41cb..812541d8 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -23,6 +23,11 @@ "type": "string", "pattern": "^\\S+\\.(bam|cram|fa|fa.gz|fasta|fasta.gz|fq|fq.gz|fastq|fastq.gz)$", "errorMessage": "Data file for reads cannot contain spaces and must be BAM/CRAM/FASTQ/FASTA" + }, + "library_layout": { + "type": "string", + "pattern": "^(SINGLE|PAIRED)$", + "errorMessage": "The only valid layouts are SINGLE and PAIRED" } }, "required": ["datafile", "datatype", "sample"] diff --git a/assets/test/samplesheet.csv b/assets/test/samplesheet.csv index 5f8d4463..c8d555be 100644 --- a/assets/test/samplesheet.csv +++ b/assets/test/samplesheet.csv @@ -1,5 +1,5 @@ -sample,datatype,datafile -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram -mMelMel3,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram +sample,datatype,datafile,library_layout +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED 
+mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED +mMelMel3,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv index 830753a7..53a5a42e 100644 --- a/assets/test/samplesheet_raw.csv +++ b/assets/test/samplesheet_raw.csv @@ -1,4 +1,4 @@ -sample,datatype,datafile -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram +sample,datatype,datafile,library_layout +mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED diff --git a/assets/test/samplesheet_s3.csv b/assets/test/samplesheet_s3.csv index dbb34181..532f584e 100644 --- a/assets/test/samplesheet_s3.csv +++ b/assets/test/samplesheet_s3.csv @@ -1,5 +1,5 @@ -sample,datatype,datafile -mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram -mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram -mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram -mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram +sample,datatype,datafile,library_layout +mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED +mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED +mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv index 
6a3ba69d..52f029e2 100644 --- a/assets/test_full/full_samplesheet.csv +++ b/assets/test_full/full_samplesheet.csv @@ -1,3 +1,3 @@ -sample,datatype,datafile -gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram -gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram +sample,datatype,datafile,library_layout +gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED +gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 6b5392bf..bc468469 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -45,11 +45,17 @@ class RowChecker: "ont", ) + VALID_LAYOUTS = ( + "SINGLE", + "PAIRED", + ) + def __init__( self, sample_col="sample", type_col="datatype", file_col="datafile", + layout_col="library_layout", **kwargs, ): """ @@ -62,11 +68,14 @@ def __init__( the read data (default "datatype"). file_col (str): The name of the column that contains the file path for the read data (default "datafile"). + layout_col (str): The name of the column that contains the layout of the + library (i.e. "PAIRED" or "SINGLE"). """ super().__init__(**kwargs) self._sample_col = sample_col self._type_col = type_col self._file_col = file_col + self._layout_col = layout_col self._seen = set() self.modified = [] @@ -82,6 +91,7 @@ def validate_and_transform(self, row): self._validate_sample(row) self._validate_type(row) self._validate_file(row) + self._validate_layout(row) self._seen.add((row[self._sample_col], row[self._file_col])) self.modified.append(row) @@ -94,7 +104,7 @@ def _validate_sample(self, row): def _validate_type(self, row): """Assert that the data type matches expected values.""" - if not any(row[self._type_col] for datatype in self.VALID_DATATYPES): + if row[self._type_col] not in self.VALID_DATATYPES: raise AssertionError( f"The datatype is unrecognized: {row[self._type_col]}\n" f"It should be one of: {', '.join(self.VALID_DATATYPES)}" ) @@ -114,6 +124,14 @@ def _validate_data_format(self, filename): f"It should be one of: {', '.join(self.VALID_FORMATS)}" ) + def _validate_layout(self, row): + """Assert that the library layout matches expected values.""" + if row[self._layout_col] not in self.VALID_LAYOUTS: + raise AssertionError( + f"The library layout is unrecognized: {row[self._layout_col]}\n" + f"It should be one of: {', '.join(self.VALID_LAYOUTS)}" + ) + def validate_unique_samples(self): """ Assert that the combination of sample name and aligned filename is unique.
@@ -178,7 +196,7 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure, see also the `blobtoolkit samplesheet`_:: - sample,datatype,datafile - sample1,hic,/path/to/file1.cram - sample1,pacbio,/path/to/file2.cram - sample1,ont,/path/to/file3.cram + sample,datatype,datafile,library_layout + sample1,hic,/path/to/file1.cram,PAIRED + sample1,pacbio,/path/to/file2.cram,SINGLE + sample1,ont,/path/to/file3.cram,SINGLE @@ -187,7 +205,7 @@ def check_samplesheet(file_in, file_out): https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv """ - required_columns = {"sample", "datatype", "datafile"} + required_columns = {"sample", "datatype", "datafile", "library_layout"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) diff --git a/bin/generate_config.py b/bin/generate_config.py index e3d00dfa..8bdec323 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -13,6 +13,7 @@ GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi" GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi" NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies" +NCBI_SEQUENCE_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/sequence_reports" RANKS = [ "genus", @@ -34,12 +35,19 @@ def parse_args(args=None): parser.add_argument("--fasta", required=True, help="Path to the Fasta file of the assembly.") parser.add_argument("--taxon_query", required=True, help="Query string/integer for this taxon.") parser.add_argument("--lineage_tax_ids", required=True, help="Mapping between BUSCO lineages and taxon IDs.") - parser.add_argument("--yml_out", required=True, help="Output YML file.") - parser.add_argument("--csv_out", required=True, help="Output CSV file.") + parser.add_argument("--output_prefix", required=True, help="Prefix to name the output files.") parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") - parser.add_argument("--version", action="version", version="%(prog)s 1.4") + parser.add_argument("--nt", required=True, help="Path to the NT database") + parser.add_argument("--read_id", action="append", help="ID of a read set") + parser.add_argument("--read_type", action="append", help="Type of a read set") + parser.add_argument("--read_layout", action="append", help="Layout of a read set") + parser.add_argument("--read_path", action="append", help="Path of a read set") + parser.add_argument("--blastp", help="Path to the blastp database", required=True) + parser.add_argument("--blastx", help="Path to the blastx database", required=True) + parser.add_argument("--blastn", help="Path to the blastn database", required=True) + parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 2.0") return parser.parse_args(args) @@ -165,8 +173,23 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] return d -def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: - con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) +def
get_sequence_report(accession: str): + response = requests.get(NCBI_SEQUENCE_API % accession).json() + if not response["reports"]: + print(f"Assembly not found: {accession}", file=sys.stderr) + sys.exit(1) + sequence_report = response["reports"] + for rec in sequence_report: + if rec["role"] == "assembled-molecule": + rec["assembly_level"] = rec["assigned_molecule_location_type"] + else: + rec["assembly_level"] = "na" + rec["chr_name"] = "na" + return sequence_report + + +def adjust_taxon_id(nt: str, taxon_info: TaxonInfo) -> int: + con = sqlite3.connect(os.path.join(nt, "taxonomy4blast.sqlite3")) cur = con.cursor() for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]: res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,)) @@ -174,49 +197,60 @@ def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: return taxon_id +def datatype_to_platform(s): + if s == "ont": + return "OXFORD_NANOPORE" + elif s.startswith("pacbio"): + return "PACBIO_SMRT" + elif s in ["hic", "illumina"]: + return "ILLUMINA" + else: + return "OTHER" + + def print_yaml( file_out, assembly_info: typing.Dict[str, typing.Union[str, int]], taxon_info: TaxonInfo, classification: typing.Dict[str, str], - odb_arr: typing.List[str], + reads, + blastp, + blastx, + blastn, + taxdump, ): data = { "assembly": assembly_info, - "busco": { - "basal_lineages": BUSCO_BASAL_LINEAGES, - # "download_dir": - "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], + "reads": { + "paired": [], + "single": [], }, - # "reads": {}, "revision": 1, "settings": { - # Only settings.stats_windows is mandatory, everything else is superfluous "blast_chunk": 100000, "blast_max_chunks": 10, "blast_min_length": 1000, "blast_overlap": 0, "stats_chunk": 1000, "stats_windows": [0.1, 0.01, 100000, 1000000], - # "taxdump": , + "taxdump": taxdump, "tmp": "/tmp", }, "similarity": { - # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous "blastn": { "name": "nt", - # "path": , + "path": blastn, }, "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"}, "diamond_blastp": { "import_max_target_seqs": 100000, "name": "reference_proteomes", - # "path": , + "path": blastp, "taxrule": "blastp=buscogenes", }, "diamond_blastx": { "name": "reference_proteomes", - # "path": , + "path": blastx, }, }, "taxon": { @@ -226,6 +260,16 @@ def print_yaml( }, "version": 2, } + + for id, datatype, layout, path in reads: + data["reads"][layout.lower()].append( + { + "prefix": id, + "file": path, + "platform": datatype_to_platform(datatype), + } + ) + out_dir = os.path.dirname(file_out) make_dir(out_dir) @@ -243,23 +287,66 @@ def print_csv(file_out, taxon_id: int, odb_arr: typing.List[str]): print("busco_lineage", odb_val, sep=",", file=fout) +def print_tsvs(output_prefix, sequence_report): + categories_tsv = f"{output_prefix}.categories.tsv" + synonyms_tsv = f"{output_prefix}.synonyms.tsv" + with open(categories_tsv, "w") as fhc: + with open(synonyms_tsv, "w") as fhs: + print("identifier", "assembly_role", "assembly_level", "assembly_unit", sep="\t", file=fhc) + print("identifier", "name", "assigned_name", "refseq_accession", sep="\t", file=fhs) + for rec in sequence_report: + print( + rec["genbank_accession"], + rec["role"], + rec["assembly_level"], + rec["assembly_unit"], + sep="\t", + file=fhc, + ) + print( + rec["genbank_accession"], + rec["sequence_name"], + rec["chr_name"],
rec.get("refseq_accession", "na"), + sep="\t", + file=fhs, + ) + + def main(args=None): args = parse_args(args) assembly_info: typing.Dict[str, typing.Union[str, int]] if args.accession: assembly_info = get_assembly_info(args.accession) + sequence_report = get_sequence_report(args.accession) else: assembly_info = {"level": "scaffold"} + sequence_report = None assembly_info["file"] = args.fasta taxon_info = fetch_taxon_info(args.taxon_query) classification = get_classification(taxon_info) odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) - taxon_id = adjust_taxon_id(args.blastn, taxon_info) - - print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr) - print_csv(args.csv_out, taxon_id, odb_arr) + taxon_id = adjust_taxon_id(args.nt, taxon_info) + + if sequence_report: + print_tsvs(args.output_prefix, sequence_report) + + reads = zip(args.read_id, args.read_type, args.read_layout, args.read_path) + + print_yaml( + f"{args.output_prefix}.yaml", + assembly_info, + taxon_info, + classification, + reads, + args.blastp, + args.blastx, + args.blastn, + args.taxdump, + ) + print_csv(f"{args.output_prefix}.csv", taxon_id, odb_arr) if __name__ == "__main__": diff --git a/bin/update_versions.py b/bin/update_versions.py index 1d3491a3..21f379a5 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,28 +15,10 @@ def parse_args(args=None): parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) - parser.add_argument("--read_id", action="append", help="ID of a read set") - parser.add_argument("--read_type", action="append", help="Type of a read set") - parser.add_argument("--read_path", action="append", help="Path of a read set") - parser.add_argument("--blastp", help="Path to the blastp database", required=True) - parser.add_argument("--blastx", help="Path to the blastx database", required=True) - parser.add_argument("--blastn", help="Path to the blastn database", required=True) - parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) - parser.add_argument("--version", action="version", version="%(prog)s 1.2.0") + parser.add_argument("--version", action="version", version="%(prog)s 1.3.0") return parser.parse_args(args) -def datatype_to_platform(s): - if s == "ont": - return "OXFORD_NANOPORE" - elif s.startswith("pacbio"): - return "PACBIO_SMRT" - elif s in ["hic", "illumina"]: - return "ILLUMINA" - else: - return "OTHER" - - def update_meta(args): with open(args.meta_in) as fh: infile = json.load(fh) @@ -54,20 +36,6 @@ def update_meta(args): del new_dict["sanger-tol/blobtoolkit"] infile["settings"]["software_versions"] = new_dict - infile["settings"]["taxdump"] = args.taxdump - for k in ["blastn", "diamond_blastp", "diamond_blastx"]: - infile["similarity"].setdefault(k, {}) - infile["similarity"]["blastn"]["path"] = args.blastn - infile["similarity"]["diamond_blastp"]["path"] = args.blastp - infile["similarity"]["diamond_blastx"]["path"] = args.blastx - - infile["reads"] = {} - for id, datatype, path in zip(args.read_id, args.read_type, args.read_path): - infile["reads"][id] = { - "file": path, - "plaform": datatype_to_platform(datatype), - } - return infile diff --git a/conf/base.config b/conf/base.config index 75fa0d06..f24b6b99 100644 --- a/conf/base.config +++ b/conf/base.config @@ -66,11 +66,11 @@ process { time = { check_max( 1.h * Math.ceil( 
meta.read_count / 1000000 ) * task.attempt, 'time' ) } } - // Temporarily the same settings as CCS + // Based on a handful of runs withName: '.*:MINIMAP2_ALIGNMENT:MINIMAP2_(HIC|ILMN)' { cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 14.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) } - time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + time = { check_max( 1.h * Math.ceil( meta.read_count / 75000000 ) * task.attempt, 'time' ) } } withName: 'WINDOWSTATS_INPUT' { diff --git a/conf/modules.config b/conf/modules.config index 1193cf5f..ceeb9c67 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -22,10 +22,20 @@ process { withName: "WINDOWMASKER_MKCOUNTS" { ext.args = "-infmt fasta -sformat obinary" + publishDir = [ + path: { "${params.outdir}/repeats/windowmasker" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename.substring(0, filename.length() - 3) + "obinary" } + ] } withName: "WINDOWMASKER_USTAT" { ext.args = "-infmt fasta -dust T -outfmt fasta" + publishDir = [ + path: { "${params.outdir}/repeats/windowmasker" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] } withName: "MINIMAP2_HIC" { diff --git a/docs/output.md b/docs/output.md index e3204a1d..8af0fd40 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,7 +2,7 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. @@ -15,6 +15,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit) - [Static plots](#static-plots) - Static versions of the BlobToolKit plots - [BUSCO](#busco) - BUSCO results +- [Repeat masking](#repeat-masking) - Masked repeats (optional) - [Read alignments](#read-alignments) - Aligned reads (optional) - [Read coverage](#read-coverage) - Read coverage tracks - [Base content](#base-content) - _k_-mer statistics (for k ≤ 4) @@ -68,6 +69,20 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas +### Repeat masking + +Results from the repeat-masking step -- only if the pipeline is run with `--mask`. + +
+Output files + +- `repeats/` + - `windowmasker/` + - `.fasta`: masked assembly in Fasta format. + - `.obinary`: frequency counts of repeats, in windowmasker's own binary format. + +
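The published counts file can be fed back to windowmasker to re-create the masked assembly outside the pipeline. A minimal sketch, assuming the `windowmasker` binary from BLAST+ is on `PATH` and using hypothetical file names; the flags mirror the `ext.args` values set for the two windowmasker steps in `conf/modules.config`:

```bash
# Re-mask the assembly from the published frequency counts (hypothetical paths).
windowmasker \
    -ustat results/repeats/windowmasker/assembly.obinary \
    -in assembly.fasta -infmt fasta \
    -dust T -outfmt fasta \
    -out assembly.masked.fasta
```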
+ + ### Read alignments Read alignments in BAM format -- only if the pipeline is run with `--align`. diff --git a/docs/usage.md b/docs/usage.md index 4d0ad33e..22df5508 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,7 +10,7 @@ ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' ``` @@ -21,30 +21,31 @@ You will need to create a samplesheet with information about the samples you wou The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: ```console -sample,datatype,datafile -sample1,hic,hic.cram -sample2,illumina,illumina.cram -sample2,illumina,illumina.cram +sample,datatype,datafile,library_layout +sample1,hic,hic.cram,PAIRED +sample2,illumina,illumina.cram,PAIRED +sample2,illumina,illumina.cram,PAIRED ``` ### Full samplesheet -The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The samplesheet can have as many columns as you desire; however, there is a strict requirement for the first 4 columns to match those defined in the table below. A final samplesheet file may look something like the one below. ```console -sample,datatype,datafile -sample1,hic,hic.cram -sample2,illumina,illumina.cram -sample3,ont,ont.cram +sample,datatype,datafile,library_layout +sample1,hic,hic.cram,PAIRED +sample2,illumina,illumina.cram,PAIRED +sample3,ont,ont.cram,SINGLE ``` -| Column | Description | -| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | -| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | -| `datafile` | Full path to read data file. | +| Column | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | +| `datafile` | Full path to read data file. | +| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`. | An [example samplesheet](assets/test/samplesheet.csv) has been provided with the pipeline. 
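A quick local check of the new column can save a failed run; below is a minimal sketch that mirrors the `_validate_layout` rule added to `bin/check_samplesheet.py` (the samplesheet name is hypothetical):

```bash
# Report any data row whose fourth column is not SINGLE or PAIRED.
awk -F',' 'NR > 1 && $4 !~ /^(SINGLE|PAIRED)$/ {
    print FILENAME ": line " NR ": invalid library_layout: " $4; bad = 1
} END { exit bad }' samplesheet.csv
```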
@@ -99,6 +100,10 @@ wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ && for file in $NT/*.tar.gz; do tar xf $file -C $NT && rm $file; done + +wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" && +tar xf taxdb.tar.gz -C $NT && +rm taxdb.tar.gz ``` ### 3. UniProt reference proteomes database @@ -228,7 +233,7 @@ List of tools for any given dataset can be fetched from the API, for example htt | Dependency | Snakemake | Nextflow | | ----------------- | --------- | -------- | | blobtoolkit | 4.3.2 | 4.3.9 | -| blast | 2.12.0 | 2.15.0 | +| blast | 2.12.0 | 2.14.1 | | blobtk | 0.5.0 | 0.5.1 | | busco | 5.3.2 | 5.5.0 | | diamond | 2.0.15 | 2.1.8 | @@ -269,9 +274,8 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -:::warning -Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -::: +> [!WARNING] +> Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). The above pipeline run specified with a params file in yaml format: @@ -308,15 +312,11 @@ This version number will be logged in reports when you run the pipeline, so that To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -:::tip If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. -::: ## Core Nextflow arguments -:::note -These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -::: +> [!NOTE] +> These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### `-profile` Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -:::info -We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. -::: +> [!NOTE] +> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility; however, when this is not possible, Conda is also supported. The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). 
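As an illustration, a complete run of this release can be driven by a params file; a minimal sketch with hypothetical paths, using the `--input` parameter from above and the optional `mask` and `align` switches referenced in `docs/output.md`:

```bash
# params.yaml holds pipeline parameters (the double-hyphen options); hypothetical values.
cat <<'EOF' > params.yaml
input: /path/to/samplesheet.csv
outdir: ./results
mask: true
align: true
EOF

nextflow run sanger-tol/blobtoolkit -r 0.7.0 -profile docker -params-file params.yaml
```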
diff --git a/modules.json b/modules.json index 3ce2a831..4af0bcd6 100644 --- a/modules.json +++ b/modules.json @@ -66,12 +66,6 @@ "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", "installed_by": ["modules"] }, - "samtools/fasta": { - "branch": "master", - "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": ["modules"], - "patch": "modules/nf-core/samtools/fasta/samtools-fasta.diff" - }, "samtools/flagstat": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf deleted file mode 100644 index ffae2a8c..00000000 --- a/modules/local/blobtoolkit/metadata.nf +++ /dev/null @@ -1,33 +0,0 @@ -process BLOBTOOLKIT_METADATA { - tag "$meta.id" - label 'process_single' - - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." - } - container "docker.io/genomehubs/blobtoolkit:4.3.9" - - input: - tuple val(meta), path(yaml) - - output: - tuple val(meta), path("*.metadata.yaml"), emit: yaml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - btk pipeline add-summary-to-metadata \\ - --config ${yaml} \\ - --out ${prefix}.metadata.yaml - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') - END_VERSIONS - """ -} diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf index 50167f8b..d736a03f 100644 --- a/modules/local/blobtoolkit/updateblobdir.nf +++ b/modules/local/blobtoolkit/updateblobdir.nf @@ -9,8 +9,10 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { input: tuple val(meta), path(input, stageAs: "input_blobdir") - tuple val(meta1), path(blastx, stageAs: "blastx.txt") - tuple val(meta2), path(blastn, stageAs: "blastn.txt") + tuple val(meta2), path(synonyms_tsv) + tuple val(meta3), path(categories_tsv) + tuple val(meta4), path(blastx, stageAs: "blastx.txt") + tuple val(meta5), path(blastn, stageAs: "blastn.txt") path(taxdump) output: @@ -25,6 +27,9 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { prefix = task.ext.prefix ?: "${meta.id}" def hits_blastx = blastx ? "--hits ${blastx}" : "" def hits_blastn = blastn ? "--hits ${blastn}" : "" + def syn = synonyms_tsv ? "--synonyms ${synonyms_tsv}" : "" + def cat = categories_tsv ? "--text ${categories_tsv}" : "" + def head = (synonyms_tsv || categories_tsv) ? "--text-header" : "" """ # In-place modifications are not great in Nextflow, so work on a copy of ${input} mkdir ${prefix} @@ -34,6 +39,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { --taxrule bestdistorder=buscoregions \\ ${hits_blastx} \\ ${hits_blastn} \\ + ${syn} ${cat} ${head} \\ --threads ${task.cpus} \\ $args \\ ${prefix} diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index 0c3aaaaf..a5556348 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -9,13 +9,7 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple val(meta), path(input) - val reads path versions - // The following are passed as "val" because we just want to know the full paths. 
No staging necessary - val blastp - val blastx - val blastn - val taxdump output: tuple val(meta), path("*.json"), emit: json @@ -27,17 +21,11 @@ process BLOBTOOLKIT_UPDATEMETA { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') """ update_versions.py \\ ${args} \\ --meta_in ${input}/meta.json \\ --software ${versions} \\ - --blastp ${blastp} \\ - --blastx ${blastx} \\ - --blastn ${blastn} \\ - --taxdump ${taxdump} \\ - $input_reads \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 3aab7ee9..a6cac943 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -11,11 +11,19 @@ process GENERATE_CONFIG { val busco_lin path lineage_tax_ids tuple val(meta2), path(blastn) + val reads + // The following are passed as "val" because we just want to know the full paths. No staging necessary + val blastp_path + val blastx_path + val blastn_path + val taxdump_path output: - tuple val(meta), path("*.yaml"), emit: yaml - tuple val(meta), path("*.csv") , emit: csv - path "versions.yml" , emit: versions + tuple val(meta), path("*.yaml") , emit: yaml + tuple val(meta), path("*.csv") , emit: csv + tuple val(meta), path("*.synonyms.tsv") , emit: synonyms_tsv, optional: true + tuple val(meta), path("*.categories.tsv"), emit: categories_tsv, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -25,6 +33,7 @@ process GENERATE_CONFIG { def prefix = task.ext.prefix ?: "${meta.id}" def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" def accession_params = params.accession ? "--accession ${params.accession}" : "" + def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_layout ${it[0].layout} --read_path ${it[1]}"}.join(' ') """ generate_config.py \\ --fasta $fasta \\ @@ -32,9 +41,13 @@ process GENERATE_CONFIG { --lineage_tax_ids $lineage_tax_ids \\ $busco_param \\ $accession_params \\ - --blastn $blastn \\ - --yml_out ${prefix}.yaml \\ - --csv_out ${prefix}.csv + --nt $blastn \\ + $input_reads \\ + --blastp ${blastp_path} \\ + --blastx ${blastx_path} \\ + --blastn ${blastn_path} \\ + --taxdump ${taxdump_path} \\ + --output_prefix ${prefix} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 7030554d..df7b1424 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -26,15 +26,18 @@ process MINIMAP2_ALIGN { def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def bam_input = reads.getExtension() in ["bam", "cram"] ? "samtools fasta ${reads} | " : "" + if (bam_input && !reference) error "Fasta/q format is required for self-alignments" + def minimap_input = bam_input ? "-" : reads def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' def set_cigar_bam = cigar_bam && bam_format ? 
"-L" : '' """ - minimap2 \\ + ${bam_input} minimap2 \\ $args \\ -t $task.cpus \\ - ${reference ?: reads} \\ - $reads \\ + ${reference ?: minimap_input} \\ + $minimap_input \\ $cigar_paf \\ $set_cigar_bam \\ $bam_output diff --git a/modules/nf-core/minimap2/align/minimap2-align.diff b/modules/nf-core/minimap2/align/minimap2-align.diff index 479818b3..fa1b7d53 100644 --- a/modules/nf-core/minimap2/align/minimap2-align.diff +++ b/modules/nf-core/minimap2/align/minimap2-align.diff @@ -8,5 +8,27 @@ Changes in module 'nf-core/minimap2/align' // Note: the versions here need to match the versions used in the mulled container below and minimap2/index conda "${moduleDir}/environment.yml" +@@ -27,15 +26,18 @@ + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" ++ def bam_input = reads.getExtension() in ["bam", "cram"] ? "samtools fasta ${reads} | " : "" ++ if (bam_input && !reference) error "Fasta/q format is required for self-alignments" ++ def minimap_input = bam_input ? "-" : reads + def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ +- minimap2 \\ ++ ${bam_input} minimap2 \\ + $args \\ + -t $task.cpus \\ +- ${reference ?: reads} \\ +- $reads \\ ++ ${reference ?: minimap_input} \\ ++ $minimap_input \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output ************************************************************ diff --git a/modules/nf-core/samtools/fasta/environment.yml b/modules/nf-core/samtools/fasta/environment.yml deleted file mode 100644 index 14585013..00000000 --- a/modules/nf-core/samtools/fasta/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: samtools_fasta -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::samtools=1.19.2 - - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf deleted file mode 100644 index 9aa03430..00000000 --- a/modules/nf-core/samtools/fasta/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process SAMTOOLS_FASTA { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : - 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" - - input: - tuple val(meta), path(input) - val(interleave) - - output: - tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta - tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved - tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton - tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "| gzip > ${prefix}_interleaved.fasta.gz" : - meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : - "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" - """ - samtools \\ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ - $input \\ - $output - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml deleted file mode 100644 index eae26f01..00000000 --- a/modules/nf-core/samtools/fasta/meta.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: "samtools_fasta" -description: Converts a SAM/BAM/CRAM file to FASTA -keywords: - - bam - - sam - - cram - - fasta -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-fasta.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: ["MIT"] -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - interleave: - type: boolean - description: Set true for interleaved fasta files -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fasta: - type: file - description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. - pattern: "*_{1,2}.fasta.gz" - - interleaved: - type: file - description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. 
- pattern: "*_interleaved.fasta.gz" - - singleton: - type: file - description: Compressed FASTA file with singleton reads - pattern: "*_singleton.fasta.gz" - - other: - type: file - description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset - pattern: "*_other.fasta.gz" -authors: - - "@priyanka-surana" -maintainers: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fasta/samtools-fasta.diff b/modules/nf-core/samtools/fasta/samtools-fasta.diff deleted file mode 100644 index e2374ed9..00000000 --- a/modules/nf-core/samtools/fasta/samtools-fasta.diff +++ /dev/null @@ -1,13 +0,0 @@ -Changes in module 'nf-core/samtools/fasta' ---- modules/nf-core/samtools/fasta/main.nf -+++ modules/nf-core/samtools/fasta/main.nf -@@ -32,7 +32,6 @@ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ -- -0 ${prefix}_other.fasta.gz \\ - $input \\ - $output - - -************************************************************ diff --git a/nextflow.config b/nextflow.config index 18994df9..caac9e54 100644 --- a/nextflow.config +++ b/nextflow.config @@ -249,7 +249,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.6.0' + version = '0.7.0' doi = '10.5281/zenodo.7949058' } diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 747bc9fa..eea7f432 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -2,13 +2,14 @@ // Create BlobTools dataset // -include { BLOBTOOLKIT_METADATA } from '../../modules/local/blobtoolkit/metadata' include { BLOBTOOLKIT_CREATEBLOBDIR } from '../../modules/local/blobtoolkit/createblobdir' include { BLOBTOOLKIT_UPDATEBLOBDIR } from '../../modules/local/blobtoolkit/updateblobdir' workflow BLOBTOOLS { take: config // channel: [ val(meta), path(config) ] + syn_tsv // channel: [ val(meta), [path(tsv)] ] + cat_tsv // channel: [ val(meta), [path(tsv)] ] windowstats // channel: [ val(meta), path(window_stats_tsvs) ] busco // channel: [ val(meta), path(full_table) ] blastp // channel: [ val(meta), path(txt) ] @@ -21,29 +22,21 @@ workflow BLOBTOOLS { ch_versions = Channel.empty() - // - // Create metadata summary file - // - BLOBTOOLKIT_METADATA ( config ) - ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) - - // // Create Blobtools dataset files // - BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) + BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, config, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() ) // // Update Blobtools dataset files // - BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump ) + BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, syn_tsv, cat_tsv, blastx, blastn, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_UPDATEBLOBDIR.out.versions.first() ) emit: - metadata = BLOBTOOLKIT_METADATA.out.yaml // channel: [ val(meta), path(yaml) ] blobdir = BLOBTOOLKIT_UPDATEBLOBDIR.out.blobdir // channel: [ val(meta), path(dir) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index 352cb01b..04dfe8d2 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -8,7 +8,6 @@ include { COMPRESSBLOBDIR } from '../../modules/local/compressblobdir' workflow FINALISE_BLOBDIR { take: 
blobdir // channel: [ val(meta), path(blobdir) ] - reads // channel: [ [meta, reads] ] software // channel: [ val(meta), path(software_yml) ] summary // channel: [ val(meta), path(summary_json) ] @@ -17,17 +16,9 @@ workflow FINALISE_BLOBDIR { ch_versions = Channel.empty() // - // MODULE: Update meta json file + // MODULE: Update the software listed in the meta json file // - BLOBTOOLKIT_UPDATEMETA ( - blobdir, - reads, - software, - params.blastp, - params.blastx, - params.blastn, - params.taxdump, - ) + BLOBTOOLKIT_UPDATEMETA ( blobdir, software ) // // MODULE: Compress all the json files diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7a8fd112..69a4757a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -67,6 +67,11 @@ workflow INPUT_CHECK { busco_lin, lineage_tax_ids, blastn, + reads.collect(flat: false).ifEmpty([]), + params.blastp, + params.blastx, + params.blastn, + params.taxdump, ) ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) @@ -106,6 +111,8 @@ workflow INPUT_CHECK { emit: reads // channel: [ val(meta), path(datafile) ] config = GENERATE_CONFIG.out.yaml // channel: [ val(meta), path(yaml) ] + synonyms_tsv = GENERATE_CONFIG.out.synonyms_tsv // channel: [ val(meta), path(tsv) ] + categories_tsv = GENERATE_CONFIG.out.categories_tsv // channel: [ val(meta), path(tsv) ] taxon_id = ch_taxon_id // channel: val(taxon_id) busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] @@ -117,7 +124,7 @@ def create_data_channels(LinkedHashMap row) { def meta = [:] meta.id = row.sample meta.datatype = row.datatype - + meta.layout = row.library_layout // add path(s) of the read file(s) to the meta map def data_meta = [] @@ -140,6 +147,7 @@ def create_data_channels_from_fetchngs(LinkedHashMap row) { // create meta map def meta = [:] meta.id = row.run_accession + meta.layout = row.library_layout // Same as https://github.com/blobtoolkit/blobtoolkit/blob/4.3.3/src/blobtoolkit-pipeline/src/lib/functions.py#L30-L39 // with the addition of "hic" diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index 0c25f4c7..33d632c7 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -2,7 +2,6 @@ // Optional alignment subworkflow using Minimap2 // -include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' include { MINIMAP2_ALIGN as MINIMAP2_HIC } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ILMN } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_CCS } from '../../modules/nf-core/minimap2/align/main' @@ -20,24 +19,8 @@ workflow MINIMAP2_ALIGNMENT { ch_versions = Channel.empty() - // Convert BAM/CRAM reads to FASTA - input - | branch { - meta, reads -> - fasta: reads.toString().endsWith(".fasta") || reads.toString().endsWith(".fasta.gz") || reads.toString().endsWith(".fa") || reads.toString().endsWith(".fa.gz") - fastq: reads.toString().endsWith(".fastq") || reads.toString().endsWith(".fastq.gz") || reads.toString().endsWith(".fq") || reads.toString().endsWith(".fq.gz") - bamcram: true - } - | set { ch_reads_by_type } - - SAMTOOLS_FASTA ( ch_reads_by_type.bamcram, true ) - ch_versions = ch_versions.mix(SAMTOOLS_FASTA.out.versions.first()) - - // Branch input by sequencing type - SAMTOOLS_FASTA.out.interleaved - | mix ( ch_reads_by_type.fasta ) - | mix ( 
ch_reads_by_type.fastq ) + input | branch { meta, reads -> hic: meta.datatype == "hic" diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index 715e5ae2..cfcb0219 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -19,7 +19,7 @@ workflow RUN_BLASTX { // - // Create metadata summary file + // Split the sequences // BLOBTOOLKIT_CHUNK ( fasta, table ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index cd97fd99..280278a7 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -180,6 +180,8 @@ workflow BLOBTOOLKIT { // BLOBTOOLS ( INPUT_CHECK.out.config, + INPUT_CHECK.out.synonyms_tsv.ifEmpty([[],[]]), + INPUT_CHECK.out.categories_tsv.ifEmpty([[],[]]), COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.all_tables, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), @@ -207,7 +209,6 @@ workflow BLOBTOOLKIT { // FINALISE_BLOBDIR ( BLOBTOOLS.out.blobdir, - INPUT_CHECK.out.reads.collect(flat: false).ifEmpty([]), CUSTOM_DUMPSOFTWAREVERSIONS.out.yml, VIEW.out.summary )
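Taken together, these changes concentrate the assembly, read, and database information in `GENERATE_CONFIG`. A sketch of the resulting `bin/generate_config.py` invocation, as wired up in `modules/local/generate_config.nf` — all paths and identifiers below are hypothetical, and the `--read_*` flags repeat once per samplesheet row:

```bash
generate_config.py \
    --fasta assembly.fasta \
    --taxon_query "Meles meles" \
    --lineage_tax_ids lineage_tax_ids.tsv \
    --accession GCA_922984935.2 \
    --nt /path/to/nt \
    --read_id mMelMel3 --read_type hic --read_layout PAIRED --read_path /path/to/hic.mMelMel3.cram \
    --blastp /path/to/reference_proteomes.dmnd \
    --blastx /path/to/reference_proteomes.dmnd \
    --blastn /path/to/nt \
    --taxdump /path/to/taxdump \
    --output_prefix mMelMel3
# Expected outputs: mMelMel3.yaml (the BlobToolKit config, with the reads, taxdump and
# similarity paths filled in), mMelMel3.csv (taxon ID and BUSCO lineages), and, when
# --accession is given, mMelMel3.categories.tsv and mMelMel3.synonyms.tsv.
```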