Skip to content

Commit

Permalink
Merge pull request #58 from gbouras13/jossreviews
Browse files Browse the repository at this point in the history
Jossreviews
  • Loading branch information
gbouras13 authored Nov 6, 2023
2 parents 75eff08 + 63f6a1b commit 637429c
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 20 deletions.
Binary file added paper/C333_chromosome_combined.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/C333_phage_combined.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
77 changes: 65 additions & 12 deletions paper/paper.bib
Original file line number Diff line number Diff line change
Expand Up @@ -129,19 +129,23 @@ @article{Wick:2021
}


@article {Mallawaarachchi:2023,
author = {Vijini Mallawaarachchi and Michael J. Roach and Bhavya Papudeshi and Sarah K. Giles and Susanna R. Grigson and Przemyslaw Decewicz and George Bouras and Ryan D. Hesse and Laura K. Inglis and Abbey L. K. Hutton and Elizabeth A. Dinsdale and Robert A. Edwards},
title = {Phables: from fragmented assemblies to high-quality bacteriophage genomes},
elocation-id = {2023.04.04.535632},
year = {2023},
doi = {10.1101/2023.04.04.535632},
publisher = {Cold Spring Harbor Laboratory},
URL = {https://www.biorxiv.org/content/early/2023/04/04/2023.04.04.535632},
eprint = {https://www.biorxiv.org/content/early/2023/04/04/2023.04.04.535632.full.pdf},
journal = {bioRxiv}
% Phables: journal version (Bioinformatics 39(10), article btad586).
% Only the tool name is brace-protected so styles may still re-case the title.
@article{Mallawaarachchi:2023,
    author  = {Mallawaarachchi, Vijini and Roach, Michael J and Decewicz, Przemyslaw and Papudeshi, Bhavya and Giles, Sarah K and Grigson, Susanna R and Bouras, George and Hesse, Ryan D and Inglis, Laura K and Hutton, Abbey L K and Dinsdale, Elizabeth A and Edwards, Robert A},
    title   = {{Phables}: from fragmented assemblies to high-quality bacteriophage genomes},
    journal = {Bioinformatics},
    volume  = {39},
    number  = {10},
    pages   = {btad586},
    year    = {2023},
    month   = sep,
    issn    = {1367-4811},
    doi     = {10.1093/bioinformatics/btad586},
    url     = {https://doi.org/10.1093/bioinformatics/btad586},
    eprint  = {https://academic.oup.com/bioinformatics/article-pdf/39/10/btad586/51972145/btad586.pdf},
}

@article{Bouras:2023,

@article{Bouras1:2023,
author = {Bouras, George and Sheppard, Anna E and Mallawaarachchi, Vijini and Vreugde, Sarah},
title = "{Plassembler: an automated bacterial plasmid assembly tool}",
journal = {Bioinformatics},
Expand All @@ -150,10 +154,59 @@ @article{Bouras:2023
pages = {btad409},
year = {2023},
month = {06},
abstract = "{With recent advances in sequencing technologies, it is now possible to obtain near-perfect complete bacterial chromosome assemblies cheaply and efficiently by combining a long-read-first assembly approach with short-read polishing. However, existing methods for assembling bacterial plasmids from long-read-first assemblies often misassemble or even miss bacterial plasmids entirely and accordingly require manual curation. Plassembler was developed to provide a tool that automatically assembles and outputs bacterial plasmids using a hybrid assembly approach. It achieves increased accuracy and computational efficiency compared to the existing gold standard tool Unicycler by removing chromosomal reads from the input read sets using a mapping approach.Plassembler is implemented in Python and is installable as a bioconda package using ‘conda install -c bioconda plassembler’. The source code is available on GitHub at https://github.com/gbouras13/plassembler. The full benchmarking pipeline can be found at https://github.com/gbouras13/plassembler\_simulation\_benchmarking, while the benchmarking input FASTQ and output files can be found at https://doi.org/10.5281/zenodo.7996690.}",
issn = {1367-4811},
doi = {10.1093/bioinformatics/btad409},
url = {https://doi.org/10.1093/bioinformatics/btad409},
eprint = {https://academic.oup.com/bioinformatics/article-pdf/39/7/btad409/50836034/btad409.pdf},
}

% bioRxiv preprint (no volume/pages; identified by DOI and elocation-id).
% The species name is brace-protected so sentence-casing styles keep "Staphylococcus" capitalised.
@article{Houtak:2023,
    author       = {Houtak, Ghais and Bouras, George and Nepal, Roshan and Shaghayegh, Gohar and Cooksley, Clare and Psaltis, Alkis James and Wormald, Peter-John and Vreugde, Sarah},
    title        = {The Intra-Host Evolutionary Landscape And Pathoadaptation Of Persistent {Staphylococcus aureus} In Chronic Rhinosinusitis},
    journal      = {bioRxiv},
    publisher    = {Cold Spring Harbor Laboratory},
    year         = {2023},
    elocation-id = {2023.03.28.534496},
    doi          = {10.1101/2023.03.28.534496},
    url          = {https://www.biorxiv.org/content/early/2023/03/28/2023.03.28.534496},
    eprint       = {https://www.biorxiv.org/content/early/2023/03/28/2023.03.28.534496.full.pdf},
}

% Pharokka paper (Bioinformatics 39(1), article btac776).
% NOTE(review): the citation key says 2023 while year/month record 2022-12;
% the key is cited from paper.md so it is left unchanged - confirm which year should print.
@article{Bouras2:2023,
    author   = {Bouras, George and Nepal, Roshan and Houtak, Ghais and Psaltis, Alkis James and Wormald, Peter-John and Vreugde, Sarah},
    title    = {{Pharokka}: a fast scalable bacteriophage annotation tool},
    journal  = {Bioinformatics},
    volume   = {39},
    number   = {1},
    pages    = {btac776},
    year     = {2022},
    month    = dec,
    abstract = {In recent years, there has been an increasing interest in bacteriophages, which has led to growing numbers of bacteriophage genomic sequences becoming available. Consequently, there is a need for a rapid and consistent genomic annotation tool dedicated for bacteriophages. Existing tools either are not designed specifically for bacteriophages or are web- and email-based and require significant manual curation, which makes their integration into bioinformatic pipelines challenging. Pharokka was created to provide a tool that annotates bacteriophage genomes easily, rapidly and consistently with standards compliant outputs. Moreover, Pharokka requires only two lines of code to install and use and takes under 5 min to run for an average 50-kb bacteriophage genome. Pharokka is implemented in Python and is available as a bioconda package using ‘conda install -c bioconda pharokka’. The source code is available on GitHub (https://github.com/gbouras13/pharokka). Pharokka has been tested on Linux-64 and MacOSX machines and on Windows using a Linux Virtual Machine.},
    issn     = {1367-4811},
    doi      = {10.1093/bioinformatics/btac776},
    url      = {https://doi.org/10.1093/bioinformatics/btac776},
    eprint   = {https://academic.oup.com/bioinformatics/article-pdf/39/1/btac776/48448991/btac776.pdf},
}

% Bakta paper (Microbial Genomics 7(11)).
% The doi field holds the bare DOI; styles add the resolver prefix themselves.
% NOTE(review): the article number is carried in eid only (the empty pages field was dropped);
% sibling entries store article numbers in pages - confirm the target style prints eid.
@article{Schwengers:2021,
    author    = {Schwengers, Oliver and Jelonek, Lukas and Dieckmann, Marius Alfred and Beyvers, Sebastian and Blom, Jochen and Goesmann, Alexander},
    title     = {{Bakta}: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification},
    journal   = {Microbial Genomics},
    year      = {2021},
    volume    = {7},
    number    = {11},
    eid       = {000685},
    doi       = {10.1099/mgen.0.000685},
    url       = {https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000685},
    publisher = {Microbiology Society},
    issn      = {2057-5858},
    type      = {Journal Article},
    keywords  = {whole-genome sequencing, bacteria, metagenome-assembled genomes, plasmids, genome annotation},
}


23 changes: 15 additions & 8 deletions paper/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,38 +33,45 @@ bibliography: paper.bib

# Summary

Microorganisms found in natural environments are fundamental components of ecosystems and play vital roles in various ecological processes. Studying their genomes can provide valuable insights into the diversity, functionality, and evolution of microbial life, as well as their impacts on human health. Once the genetic material is extracted from environmental samples, it undergoes sequencing using advanced technologies like whole genome sequencing (WGS). The raw sequence data is then analysed, and computational methods are applied to assemble the fragmented sequences and reconstruct the complete microbial genomes [@Wick:2021] [@Mallawaarachchi:2023] [@Bouras:2023].

Many microorganisms including archaea, bacteria, plasmids, viruses, and bacteriophages, can have circular genomes. However, a circular genome sequence once assembled is represented as a linear character string and labelled in some way to indicate that it should be circular. The point at which the linear sequence begins is random, due to the nature of the algorithms employed in assembling genomes from sequencing reads. Such arbitrary startpoints can affect downstream genome annotation and analysis; they may occur within coding sequences (CDS), can disrupt the prediction potential of mobile genetic elements like prophages, and make pangenome analyses based on gene order difficult. Therefore, microbial sequences are often required to be reoriented to begin by convention with certain genes: the dnaA chromosomal replication initiator gene for bacterial chromosomes, the repA plasmid replication initiation gene for plasmids and the terL large terminase subunit gene for bacteriophages as shown in \autoref{fig:workflow}. Here we present Dnaapler, a flexible microbial sequence reorientation tool that allows for rapid and consistent orientation of circular microbial genomes such as bacteria, plasmids and bacteriophages. Dnaapler is hosted on GitHub at [github.com/gbouras13/dnaapler](https://github.com/gbouras13/dnaapler).
Microorganisms found in natural environments are fundamental components of ecosystems and play vital roles in various ecological processes. Studying their genomes can provide valuable insights into the diversity, functionality, and evolution of microbial life, as well as their impacts on human health. Once the genetic material is extracted from environmental samples, it undergoes sequencing using technologies like whole genome sequencing (WGS). The raw sequence data is then analysed, and computational methods are applied to assemble the fragmented sequences and reconstruct the complete microbial genomes [@Wick:2021; @Mallawaarachchi:2023; @Bouras1:2023].

Many biological entities including Bacteria, Archaea, plasmids, bacteriophages and other viruses can have circular genomes. Once assembled, a circular genome sequence is represented as a linear character string and labelled in some way to indicate that it should be circular. The point at which the linear sequence begins is random, due to the nature of the algorithms employed in assembling genomes from sequencing reads. Such arbitrary startpoints can affect downstream genome annotation and analysis; they may occur within coding sequences (CDS), can disrupt the prediction potential of mobile genetic elements like prophages, and make pangenome analyses based on gene order difficult. Therefore, microbial sequences are often required to be reoriented to begin by convention with certain genes: the dnaA chromosomal replication initiator gene for bacterial chromosomes, the repA plasmid replication initiation gene for plasmids and the terL large terminase subunit gene for bacteriophages as shown in \autoref{fig:workflow}. Here we present Dnaapler, a flexible microbial sequence reorientation tool that allows for rapid and consistent orientation of circular microbial genomes such as Bacteria, plasmids and bacteriophages. Dnaapler is hosted on GitHub at [github.com/gbouras13/dnaapler](https://github.com/gbouras13/dnaapler).

![Example microbial genome assembly workflow.\label{fig:workflow}](Dnaapler_figure.png){width=100%}


# Statement of need

Circlator [@Hunt:2015] is the most commonly used dedicated tool for reorienting bacterial genomes. However, Circlator was designed for bacterial chromosomes and plasmids only, is no longer supported by its developers, has several burdensome external dependencies, and requires the corrected reads in FASTA or FASTQ format along with the FASTA genome assembly as input. Alternatively, genome reorientation is often performed manually or with custom scripts on a genome-by-genome and project-by-project basis, making integration into assembly workflows difficult and creating inconsistencies between different projects and researchers. We propose Dnaapler, a light-weight command-line tool written in Python 3 that can easily be integrated into assembly workflows. Dnaapler takes only a FASTA formatted genome file as input. It uses the Basic Local Alignment Search Tool (BLAST) [@Altschul:1990] [@Mount:2007] — its only external dependency — or Pyrodigal [@Larralde:2022] [@Hyatt:2010] depending on the chosen subcommand for reorientation. A list of the subcommands provided in Dnaapler are as follows:
Circlator [@Hunt:2015] is the most commonly used dedicated tool for reorienting bacterial genomes. However, Circlator was designed for bacterial chromosomes and plasmids only, is no longer supported by its developers, has several burdensome external dependencies, and requires the corrected reads in FASTA or FASTQ format along with the FASTA genome assembly as input. Alternatively, genome reorientation is often performed manually or with custom scripts on a genome-by-genome and project-by-project basis, making integration into assembly workflows difficult, and creating inconsistencies between different projects and researchers. We propose Dnaapler, a lightweight command-line tool written in Python that can be easily integrated into assembly workflows. Dnaapler takes only a FASTA-formatted genome file as input. It uses BLAST [@Altschul:1990; @Mount:2007] — its only external dependency — or Pyrodigal [@Larralde:2022; @Hyatt:2010] depending on the chosen subcommand for reorientation. The subcommands provided by Dnaapler are as follows:

| Subcommand | Database used | Gene used to reorient |
|------------|-------------------------------------------------------------------------------|---------------------------------------------|
| chromosome | Custom database downloaded from Swissprot | dnaA chromosomal replication initiator gene |
| plasmid | repA database curated from Unicycler [@Wick:2017] | repA plasmid replication initiation gene |
| phage | Prokaryotic Virus Remote Homologous Groups database (PHROGs) [@Terzian:2021] | terL large terminase subunit gene |
| all | Chromosome, plasmid and phage databases combined | dnaA, repA and terL |
| custom | User specified | Custom gene |
| mystery | Pyrodigal predicted coding sequences | Random CDS |
| nearest | Pyrodigal predicted coding sequences | First CDS (nearest to the start) |
| largest | Pyrodigal predicted coding sequences | Largest CDS |
| bulk | Either chromosome, plasmid, phage or custom. Requires multiple input contigs. | dnaA, repA, terL or a custom gene |


Specifically, Dnaapler 'chromosome', 'phage' and 'plasmid' subcommands use blastx (protein databases are searched using a translated nucleotide query) to search for the dnaA, terL or repA gene respectively in the input genomes, using built-in amino acid databases for each gene. Dnaapler will check that the first amino acid of the identified start site begins with Methionine, Valine, or Leucine (the 3 most used gene start codons in bacteria and bacteriophages) and will then reorient the genome to begin with this gene in the forward direction. If the 'custom' subcommand is selected, the same process will be conducted but with a user specified amino acid FASTA formatted input database. If the 'mystery' or 'nearest' subcommands are selected, Pyrodigal will be used to predict all coding sequences, and the genome will be reoriented to begin with either a random (mystery) or the first (nearest) CDS, respectively. Dnaapler returns an output directory containing a log file and the genome reoriented as a FASTA formatted file. Finally, the 'bulk' subcommand can be used to reorient multiple input contigs (in a mulitFASTA format file) using either the chromosome, plasmid, phage or custom reorientation strategies.
Specifically, Dnaapler 'chromosome', 'phage' and 'plasmid' subcommands use blastx (protein databases are searched using a translated nucleotide query) to search for the dnaA, terL or repA gene respectively in the input genomes, using built-in amino acid databases for each gene. Dnaapler 'all' will run a blastx search against all three databases simultaneously, prioritising dnaA hits, then repA and finally terL if multiple genes have hits. Taking the top blastx hit, Dnaapler will check that the first amino acid of the BLAST alignment is Methionine, Valine, or Leucine (corresponding to the three most commonly used start codons in bacteria and bacteriophages). If it is, Dnaapler will reorient the genome to begin at that position in the forward direction. If it is not, Pyrodigal will be used to predict all coding sequences; Dnaapler will then identify the CDS with the most overlap to the top blastx hit, and reorient the genome to begin with the start codon of that CDS in the forward direction.

If the 'custom' subcommand is selected, the same process will be conducted but with a user-specified amino acid FASTA-formatted input database. If the 'mystery', 'nearest' or 'largest' subcommands are selected, Pyrodigal will be used to predict all coding sequences, and the genome will be reoriented to begin with a random CDS (mystery), the first CDS (nearest), or the largest CDS (largest), respectively. Dnaapler returns an output directory containing a log file and the genome reoriented as a FASTA-formatted file. Finally, the 'bulk' subcommand can be used to reorient multiple input contigs (in a multiFASTA format file) using either the chromosome, plasmid, phage or custom reorientation strategies.

Examples of Dnaapler's functionality on the C333 _Staphylococcus aureus_ chromosome and the C333 Sa3int prophage (GenBank accession GCA_030288915.1, Sample Number SAMN32360890 from BioProject PRJNA914892 from [@Houtak:2023]) are shown below using the plotting functionalities of Bakta v1.8.2 [@Schwengers:2021] and Pharokka v1.5.1 [@Bouras2:2023].

![Example Dnaapler phage reorientation of the C333 Sa3int prophage as a circular genomic map from Pharokka beginning at the top of the circle. Each coloured arrow represents a CDS. The large terminase subunit gene is labelled as terL. Dnaapler identified the terL gene as beginning with coordinate 19146 on the forward strand. \label{fig:prophage}](C333_phage_combined.png){width=100%}

![Example Dnaapler chromosome reorientation of the C333 chromosome as a circular genomic map from Bakta beginning at the top of the circle. Each thin line represents a CDS, with the positive-stranded CDSs denoted in the outer ring and the negative-stranded CDSs in the inner ring. The position of the chromosomal replication initiator gene is labelled as dnaA. The red and green ring denotes the GC content while the blue and yellow ring denotes the GC skew. Dnaapler identified the dnaA gene as beginning with coordinate 466140 on the reverse strand. \label{fig:chromosome}](C333_chromosome_combined.png){width=100%}

Dnaapler has already been integrated into the United States of America StaPH-B (State Public Health Lab Bioinformatics) consortium [Docker image collection](https://github.com/StaPH-B/docker-builds).

# Availability

Dnaapler is distributed on PyPI. A [Conda](https://conda.io/) package is
also available in the Bioconda channel [@Bioconda:2018]. The source code is available on [GitHub](https://github.com/gbouras13/dnaapler),
and features Continuous Integration tests and test coverage, and Continuous Deployment using Github actions.
and features Continuous Integration tests and test coverage, and Continuous Deployment using GitHub Actions. Dnaapler has already been integrated into the United States of America StaPH-B (State Public Health Lab Bioinformatics) consortium [Docker image collection](https://github.com/StaPH-B/docker-builds).

# Acknowledgements
We would like to thank Michael B. Hall for providing some code snippets particularly the external tool class from his tool [tbpore](https://github.com/mbhall88/tbpore), Ryan Wick for curating a repA database from Unicycler and Sarah Vreugde and Robert A. Edwards for their supervision.
Expand Down

0 comments on commit 637429c

Please sign in to comment.