Skip to content

Commit

Permalink
feat: first pass implementation of encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Feb 6, 2024
1 parent 6fcbd60 commit d872c82
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 3 deletions.
61 changes: 58 additions & 3 deletions src/derive/command/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
//! Functionality relating to the `ngs derive encoding` subcommand itself.
use std::collections::HashSet;
use std::io::BufReader;
use std::path::PathBuf;

use anyhow::Context;
use anyhow::Ok;
use clap::Args;
use noodles::bam;
use num_format::Locale;
use num_format::ToFormattedString;
use tracing::info;

use crate::derive::encoding::compute;
use crate::utils::args::NumberOfRecords;
use crate::utils::display::RecordCounter;

/// Clap arguments for the `ngs derive encoding` subcommand.
#[derive(Args)]
pub struct DeriveEncodingArgs {
// Source NGS file (BAM or FASTQ).
#[arg(value_name = "NGS_FILE")]
/// Source BAM.
#[arg(value_name = "BAM")]
src: PathBuf,

/// Only examine the first n records in the file.
Expand All @@ -18,6 +29,50 @@ pub struct DeriveEncodingArgs {
}

/// Main function for the `ngs derive encoding` subcommand.
pub fn derive(_args: DeriveEncodingArgs) -> anyhow::Result<()> {
pub fn derive(args: DeriveEncodingArgs) -> anyhow::Result<()> {
info!("Starting derive readlen subcommand.");

let file = std::fs::File::open(args.src);
let reader = file
.map(BufReader::new)
.with_context(|| "opening BAM file")?;
let mut reader = bam::Reader::new(reader);
let _header: String = reader.read_header()?.parse()?;
reader.read_reference_sequences()?;

let mut score_set: HashSet<u8> = HashSet::new();

// (1) Collect quality scores from reads within the
// file. Support for sampling only a portion of the reads is provided.
let num_records = NumberOfRecords::from(args.num_records);
let mut counter = RecordCounter::new();

for result in reader.lazy_records() {
let record = result?;

for i in 0..record.quality_scores().len() {
let score = record.quality_scores().as_ref()[i];
score_set.insert(score);
}

counter.inc();
if counter.time_to_break(&num_records) {
break;
}
}

info!(
"Processed {} records.",
counter.get().to_formatted_string(&Locale::en)
);

// (2) Derive encoding from the observed quality scores
let result = compute::predict(score_set)?;

// (3) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
print!("{}", output);

Ok(())
}
78 changes: 78 additions & 0 deletions src/derive/encoding/compute.rs
Original file line number Diff line number Diff line change
@@ -1 +1,79 @@
//! Module holding the logic for computing the quality score encoding.
use anyhow::bail;
use serde::Serialize;
use std::collections::HashSet;

const MAX_VALID_PHRED_SCORE: u8 = 93;
const SANGER_MIN: u8 = 0;
const ILLUMINA_1_0_MIN: u8 = 26;
const ILLUMINA_1_3_MIN: u8 = 31;

/// Struct holding the final results for an `ngs derive encoding` subcommand
/// call.
#[derive(Debug, Serialize)]
pub struct DerivedEncodingResult {
/// Whether or not the `ngs derive encoding` subcommand succeeded.
pub succeeded: bool,

/// The detected quality score encoding, if available.
pub encoding: Option<String>,

/// The minimum quality score observed.
pub observed_min: u8,

/// The maximum quality score observed.
pub observed_max: u8,
}

impl DerivedEncodingResult {
/// Creates a new [`DerivedEncodingResult`].
pub fn new(
succeeded: bool,
encoding: Option<String>,
observed_min: u8,
observed_max: u8,
) -> Self {
DerivedEncodingResult {
succeeded,
encoding,
observed_min,
observed_max,
}
}
}

/// Main method to evaluate the observed quality scores and
/// return a result for the derived encoding. This may fail, and the
/// resulting [`DerivedEncodingResult`] should be evaluated accordingly.
pub fn predict(score_set: HashSet<u8>) -> Result<DerivedEncodingResult, anyhow::Error> {
if score_set.is_empty() {
bail!("No quality scores were detected in the file.");
}

let observed_min = *score_set.iter().min().unwrap();
let observed_max = *score_set.iter().max().unwrap();

let mut result = DerivedEncodingResult::new(false, None, observed_min, observed_max);

if observed_max > MAX_VALID_PHRED_SCORE {
return anyhow::Ok(result);
}
match observed_min {
ILLUMINA_1_3_MIN..=MAX_VALID_PHRED_SCORE => {
result.succeeded = true;
result.encoding = Some("Illumina 1.3".to_string());
}
ILLUMINA_1_0_MIN..=MAX_VALID_PHRED_SCORE => {
result.succeeded = true;
result.encoding = Some("Illumina 1.0".to_string());
}
SANGER_MIN..=MAX_VALID_PHRED_SCORE => {
result.succeeded = true;
result.encoding = Some("Sanger/Illumina 1.8".to_string());
}
_ => bail!("This shouldn't be possible!"),
}

anyhow::Ok(result)
}

0 comments on commit d872c82

Please sign in to comment.