From 8d4441d0962ae813a93f3a6909b870143bb19e17 Mon Sep 17 00:00:00 2001 From: Nick Greenfield Date: Mon, 29 Jan 2018 17:12:19 -0800 Subject: [PATCH 1/2] Update paper.md and paper.bib --- paper/paper.bib | 24 ++++++++++++++++++++++++ paper/paper.md | 12 ++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 4ebafd9..af68d73 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -48,3 +48,27 @@ @article{Sim2017 title = {Determining the cause of recurrent Clostridium difficile infection using whole genome sequencing}, journal = {Diagnostic Microbiology and Infectious Disease} } + +@article{Matsakis2014, + doi = {10.1145/2692956.2663188}, + url = {http://doi.acm.org/10.1145/2692956.2663188}, + year = {2014}, + month = oct, + publisher = {ACM}, + volume = {34}, + number = {3}, + pages = {103--104}, + author = {Matsakis, Nicholas D. and Klock, II, Felix S.}, + title = {The {Rust} Language}, + journal = {Ada Lett.} +} + +@article{Minot2015, + doi = {10.1101/027607}, + url = {https://www.biorxiv.org/content/early/2015/09/28/027607}, + year = {2015}, + author = {Minot, Samuel S and Krumm, Niklas and Greenfield, Nicholas B}, + publisher = {Cold Spring Harbor Laboratory}, + title = {{One Codex}: A Sensitive and Accurate Data Platform for Genomic Microbial Identification}, + journal = {bioRxiv} +} diff --git a/paper/paper.md b/paper/paper.md index cb3267b..8710c16 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -18,10 +18,14 @@ bibliography: paper.bib --- # Summary -MinHash [@Broder1997] is a document similarity estimation technique that has been applied to problems in genomics including sequence search, phylogenetic reconstruction [@Ondov2016; @Brown2016], and evaluating outbreaks of hospital acquired infections (HAIs) [@Sim2017]. 
-We implement two additions to existing MinHash schemes: calculating abundances (i.e., minmer counts) during the generation of the MinHash sketches and adaptively correcting for biases introduced due to variable sequencing depth. -This count information greatly improves the utility of MinHashing when working directly from raw read data (i.e., FASTQ files) and allows more robust estimation of distances between both isolates and complex metagenomic samples. -`finch-rs` is a Rust library and a corresponding command line tool, `finch`, for creating and manipulating MinHash sketches with abundance information. +MinHash [@Broder1997] is a document similarity estimation technique that has been applied to problems in genomics including sequence search, phylogenetic reconstruction [@Ondov2016; @Brown2016], and evaluating outbreaks of hospital acquired infections (HAIs) [@Sim2017]. We developed the `finch-rs` library (https://github.com/onecodex/finch-rs) and `finch` command line tool for creating, filtering, and manipulating MinHash sketches from genomics data, including both FASTA sequence files and FASTQ raw read data from next-generation sequencing (NGS) instruments. We extend existing MinHash schemes for genomics data with two major additions: (1) calculation of abundances (i.e., minmer counts) during the generation of the MinHash sketches; and (2) adaptive correction of biases introduced due to variable sequencing depths. These features greatly improve the utility of MinHashing when applied directly to raw read data (i.e., FASTQ files) and allow more robust estimation of distances between both isolates and complex metagenomic samples. + +Finch and similar genomic MinHashing software works by breaking sequence data up into k-length nucleotide or amino acid subsequences ("k-mers"), computing the hash of each k-mer, and then taking the _n_ lowest hash values. Collectively, these _n_ smallest values ("minmers") comprise a "sketch" of the input sample. 
By default, previous MinHash implementations for genomics data work by creating sketches from _all_ k-mers from an input genomic dataset (though the original Mash tool does enable filtering out k-mers that appear only once using a Bloom filter [@Ondov2016]). While this works well for high-quality sequences such as genome assemblies (i.e., FASTA files), it quickly becomes problematic when working with raw FASTQ data, where errors from NGS instruments can lead to a far larger number of _unique_ observed k-mers than are truly present biologically. Similarly, this also leads to the inclusion of sequencing errors and k-mers from minor community members when comparing complex, mixed genomic samples (i.e., microbiome samples). In both cases, non-representative k-mers (either direct products of sequencing error or low abundance organisms) come to dominate sketches and confound inter-sample distance estimates. + +We address this filtering challenge by "over-sketching" the input genomic data. First, we create a sketch substantially larger than the desired final size (_n_), tracking the abundances of each k-mer in the sketch, and using the abundances in the large sketch to determine a dynamic filtering threshold. We also track how often each k-mer is seen in its forward versus its reverse orientation. These two metrics allow us to both: (1) estimate the empirical sequencing error in the sample and only select k-mers that appear to be biologically present; and (2) remove k-mers that exhibit unbalanced forward and reverse orientation ratios. The former addresses the challenges of comparing data sequenced to varying depths (or samples of varying natural complexity), while the latter can correct for errors that may stem from measurement artifacts such as reads that include adapters and barcode sequences. By removing these error k-mers from a final, reduced sketch (size _n_), `finch` more robustly estimates distances between sets of both isolates and complex metagenomic samples. 
+ +Finch is written in the Rust programming language [@Matsakis2014] which reduces programming errors through static type checking, allows greater control over performance, and easily integrates into higher-level languages such as Python or R. We have integrated Finch into our One Codex platform [@Minot2015] and are using it to power clustering and similarity search features. + # References From 29cbdb77748da83e813dc7fe898fe2acc83c8a51 Mon Sep 17 00:00:00 2001 From: Roderick Bovee Date: Tue, 30 Jan 2018 10:48:44 -0800 Subject: [PATCH 2/2] Update paper.md Remove some commas and other final touch-ups --- paper/paper.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 8710c16..c3ea48e 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: 'Finch' +title: 'Finch: a tool adding dynamic abundance filtering to genomic MinHashing' tags: - minhash - Rust @@ -21,9 +21,9 @@ bibliography: paper.bib MinHash [@Broder1997] is a document similarity estimation technique that has been applied to problems in genomics including sequence search, phylogenetic reconstruction [@Ondov2016; @Brown2016], and evaluating outbreaks of hospital acquired infections (HAIs) [@Sim2017]. We developed the `finch-rs` library (https://github.com/onecodex/finch-rs) and `finch` command line tool for creating, filtering, and manipulating MinHash sketches from genomics data, including both FASTA sequence files and FASTQ raw read data from next-generation sequencing (NGS) instruments. We extend existing MinHash schemes for genomics data with two major additions: (1) calculation of abundances (i.e., minmer counts) during the generation of the MinHash sketches; and (2) adaptive correction of biases introduced due to variable sequencing depths. 
These features greatly improve the utility of MinHashing when applied directly to raw read data (i.e., FASTQ files) and allow more robust estimation of distances between both isolates and complex metagenomic samples. -Finch and similar genomic MinHashing software works by breaking sequence data up into k-length nucleotide or amino acid subsequences ("k-mers"), computing the hash of each k-mer, and then taking the _n_ lowest hash values. Collectively, these _n_ smallest values ("minmers") comprise a "sketch" of the input sample. By default, previous MinHash implementations for genomics data work by creating sketches from _all_ k-mers from an input genomic dataset (though the original Mash tool does enable filtering out k-mers that appear only once using a Bloom filter [@Ondov2016]). While this works well for high-quality sequences such as genome assemblies (i.e., FASTA files), it quickly becomes problematic when working with raw FASTQ data, where errors from NGS instruments can lead to a far larger number of _unique_ observed k-mers than are truly present biologically. Similarly, this also leads to the inclusion of sequencing errors and k-mers from minor community members when comparing complex, mixed genomic samples (i.e., microbiome samples). In both cases, non-representative k-mers (either direct products of sequencing error or low abundance organisms) come to dominate sketches and confound inter-sample distance estimates. +Finch and similar genomic MinHashing software works by breaking sequence data up into k-length nucleotide or amino acid subsequences ("k-mers"), computing a hash of each k-mer, and then taking the _n_ lowest hash values. Collectively these _n_ smallest values ("minmers") comprise a "sketch" of the input sample. 
By default, previous MinHash implementations for genomics data work by creating sketches from _all_ k-mers from an input genomic dataset (though the original Mash tool does enable filtering out k-mers that appear only once using a Bloom filter [@Ondov2016]). While this works well for high-quality sequences such as genome assemblies (i.e., FASTA files), it quickly becomes problematic when working with raw FASTQ data where errors from NGS instruments can lead to a far larger number of _unique_ observed k-mers than are truly present biologically. Similarly, this also leads to the inclusion of sequencing errors and k-mers from minor community members when comparing complex, mixed genomic samples (i.e., microbiome samples). In both cases, non-representative k-mers (either direct products of sequencing error or low abundance organisms) come to dominate sketches and confound inter-sample distance estimates. -We address this filtering challenge by "over-sketching" the input genomic data. First, we create a sketch substantially larger than the desired final size (_n_), tracking the abundances of each k-mer in the sketch, and using the abundances in the large sketch to determine a dynamic filtering threshold. We also track how often each k-mer is seen in its forward versus its reverse orientation. These two metrics allow us to both: (1) estimate the empirical sequencing error in the sample and only select k-mers that appear to be biologically present; and (2) remove k-mers that exhibit unbalanced forward and reverse orientation ratios. The former addresses the challenges of comparing data sequenced to varying depths (or samples of varying natural complexity), while the latter can correct for errors that may stem from measurement artifacts such as reads that include adapters and barcode sequences. By removing these error k-mers from a final, reduced sketch (size _n_), `finch` more robustly estimates distances between sets of both isolates and complex metagenomic samples. 
+We address this filtering challenge by "over-sketching" the input genomic data. First, we create a sketch substantially larger than the desired final size (_n_), tracking the abundances of each k-mer in the sketch, and using the abundances in the large sketch to determine a dynamic filtering threshold. We also track how often each k-mer is seen in its forward versus its reverse orientation. These two metrics allow us to both: (1) estimate the empirical sequencing error in the sample and remove k-mers that may not be biologically present; and (2) remove k-mers that exhibit unbalanced forward and reverse orientation ratios. The former addresses the challenges of comparing data sequenced to varying depths (or samples of varying natural complexity) while the latter can correct for errors that may stem from measurement artifacts such as reads that include adapters and barcode sequences. By removing these error k-mers from a final, reduced sketch (size _n_), `finch` more robustly estimates distances between sets of both isolates and complex metagenomic samples. Finch is written in the Rust programming language [@Matsakis2014] which reduces programming errors through static type checking, allows greater control over performance, and easily integrates into higher-level languages such as Python or R. We have integrated Finch into our One Codex platform [@Minot2015] and are using it to power clustering and similarity search features.