diff --git a/README.md b/README.md index 2039f4d..44bcc55 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ sequencing technology. Read more about the development of this method [here][pap [![Conda (channel only)](https://img.shields.io/conda/vn/bioconda/nohuman)](https://anaconda.org/bioconda/nohuman) [![bioconda version](https://anaconda.org/bioconda/nohuman/badges/platforms.svg)](https://anaconda.org/bioconda/nohuman) -![Conda](https://img.shields.io/conda/dn/bioconda/nohuman) +![Conda Downloads](https://img.shields.io/conda/d/bioconda/nohuman) ```shell $ conda install -c bioconda nohuman @@ -187,6 +187,18 @@ or to specify a different path for the output $ nohuman -t 4 --out1 clean_1.fq --out2 clean_2.fq in_1.fq in_2.fq ``` +Set a [minimum confidence score][conf] for kraken2 classifications + +``` +$ nohuman --conf 0.5 in.fq +``` + +or write the kraken2 read classification output to a file + +``` +$ nohuman -k kraken.out in.fq +``` + > [!TIP] > Compressed output will be inferred from the specified output path(s). If no output path is provided, the same > compression as the input will be used. To override the output compression format, use the `--output-type` option. @@ -215,6 +227,8 @@ Options: -F, --output-type Output compression format. u: uncompressed; b: Bzip2; g: Gzip; x: Xz (Lzma); z: Zstd -t, --threads Number of threads to use in kraken2 and optional output compression. 
Cannot be 0 [default: 1] -H, --human Output human reads instead of removing them + -C, --conf <[0, 1]> Kraken2 minimum confidence score [default: 0.0] + -k, --kraken-output <FILE> Write the Kraken2 read classification output to a file -v, --verbose Set the logging level to verbose -h, --help Print help (see more with '--help') -V, --version Print version @@ -275,7 +289,15 @@ Options: -H, --human Output human reads instead of removing them - + + -C, --conf <[0, 1]> + Kraken2 minimum confidence score + + [default: 0.0] + + -k, --kraken-output <FILE> + Write the Kraken2 read classification output to a file + -v, --verbose Set the logging level to verbose @@ -326,4 +348,6 @@ more details and for other alternate approaches. [paper]: https://doi.org/10.1093/gigascience/giae010 -[ghcr]: https://github.com/mbhall88/nohuman/pkgs/container/nohuman \ No newline at end of file +[ghcr]: https://github.com/mbhall88/nohuman/pkgs/container/nohuman + +[conf]: https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#confidence-scoring diff --git a/src/lib.rs b/src/lib.rs index 91f9f70..aad304a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -157,6 +157,16 @@ pub fn validate_db_directory(path: &Path) -> Result { )) } +/// Parse confidence score from the command line. Will be passed on to kraken2. Must be in the +/// closed interval [0, 1] - i.e. 0 <= confidence <= 1. 
+pub fn parse_confidence_score(s: &str) -> Result<f32, String> { + let confidence: f32 = s.parse().map_err(|_| "Confidence score must be a number")?; + if !(0.0..=1.0).contains(&confidence) { + return Err("Confidence score must be in the closed interval [0, 1]".to_string()); + } + Ok(confidence) +} + #[cfg(test)] mod tests { use super::*; @@ -205,4 +215,25 @@ mod tests { let expected = PathBuf::from("Cargo.toml"); assert_eq!(actual, expected) } + + #[test] + fn test_parse_confidence_score() { + let result = parse_confidence_score("0.5"); + assert!(result.is_ok()); + assert_eq!(result.unwrap(), 0.5); + + let result = parse_confidence_score("1.0"); + assert!(result.is_ok()); + assert_eq!(result.unwrap(), 1.0); + + let result = parse_confidence_score("0.0"); + assert!(result.is_ok()); + assert_eq!(result.unwrap(), 0.0); + + let result = parse_confidence_score("1.1"); + assert!(result.is_err()); + + let result = parse_confidence_score("-0.1"); + assert!(result.is_err()); + } } diff --git a/src/main.rs b/src/main.rs index a2a8005..ae56366 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,8 @@ use env_logger::Builder; use log::{debug, error, info, warn, LevelFilter}; use nohuman::compression::CompressionFormat; use nohuman::{ - check_path_exists, download::download_database, validate_db_directory, CommandRunner, + check_path_exists, download::download_database, parse_confidence_score, validate_db_directory, + CommandRunner, }; static DEFAULT_DB_LOCATION: LazyLock = LazyLock::new(|| { @@ -72,6 +73,14 @@ struct Args { #[arg(short = 'H', long = "human")] keep_human_reads: bool, + /// Kraken2 minimum confidence score + #[arg(short = 'C', long = "conf", value_name = "[0, 1]", default_value = "0.0", value_parser = parse_confidence_score)] + confidence: f32, + + /// Write the Kraken2 read classification output to a file. 
+ #[arg(short, long, value_name = "FILE")] + kraken_output: Option<PathBuf>, + /// Set the logging level to verbose #[arg(short, long)] verbose: bool, @@ -139,9 +148,10 @@ fn main() -> Result<()> { // error out if input files are not provided, otherwise unwrap to a variable let input = args.input.context("No input files provided")?; - let temp_kraken_output = - tempfile::NamedTempFile::new().context("Failed to create temporary kraken output file")?; + let kraken_output = args.kraken_output.unwrap_or_else(|| PathBuf::from("/dev/null")); + let kraken_output = kraken_output.to_string_lossy(); let threads = args.threads.to_string(); + let confidence = args.confidence.to_string(); let db = validate_db_directory(&args.database) .map_err(|e| anyhow::anyhow!(e))? .to_string_lossy() @@ -152,7 +162,9 @@ fn main() -> Result<()> { "--db", &db, "--output", - temp_kraken_output.path().to_str().unwrap(), + &kraken_output, + "--confidence", + &confidence, ]; match input.len() { 0 => bail!("No input files provided"),