Skip to content

Commit

Permalink
[WIP] to share code. Partial strandedness implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Feb 8, 2024
1 parent d872c82 commit 89b60e0
Show file tree
Hide file tree
Showing 12 changed files with 637 additions and 62 deletions.
1 change: 1 addition & 0 deletions src/derive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ pub mod endedness;
pub mod instrument;
pub mod junction_annotation;
pub mod readlen;
pub mod strandedness;
5 changes: 5 additions & 0 deletions src/derive/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod endedness;
pub mod instrument;
pub mod junction_annotation;
pub mod readlen;
pub mod strandedness;

use clap::Args;
use clap::Subcommand;
Expand Down Expand Up @@ -36,6 +37,10 @@ pub enum DeriveSubcommand {
/// Derives the read length of the file.
Readlen(self::readlen::DeriveReadlenArgs),

/// Derives the strandedness of the RNA-Seq file.
/// This subcommand requires a GFF file.
Strandedness(self::strandedness::DeriveStrandednessArgs),

/// Annotates junctions in the file.
/// This subcommand requires a GFF file with features to annotate.
/// This subcommand does not "derive" anything, but is included here for
Expand Down
5 changes: 2 additions & 3 deletions src/derive/command/endedness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@ use tracing::info;
use tracing::trace;

use crate::derive::endedness::compute;
use crate::derive::endedness::compute::{
validate_read_group_info, OrderingFlagsCounts, OVERALL, UNKNOWN_READ_GROUP,
};
use crate::derive::endedness::compute::OrderingFlagsCounts;
use crate::utils::args::arg_in_range as deviance_in_range;
use crate::utils::args::NumberOfRecords;
use crate::utils::display::RecordCounter;
use crate::utils::formats::bam::ParsedBAMFile;
use crate::utils::formats::utils::IndexCheck;
use crate::utils::read_groups::{validate_read_group_info, OVERALL, UNKNOWN_READ_GROUP};

/// Clap arguments for the `ngs derive endedness` subcommand.
#[derive(Args)]
Expand Down
18 changes: 9 additions & 9 deletions src/derive/command/junction_annotation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,34 +31,34 @@ pub struct JunctionAnnotationArgs {

/// Name of the exon region feature for the gene model used.
#[arg(long, value_name = "STRING", default_value = "exon")]
pub exon_feature_name: String,
exon_feature_name: String,

/// Minimum intron length to consider.
/// An intron is defined as an `N` CIGAR operation of any length.
#[arg(short = 'i', long, value_name = "USIZE", default_value = "50")]
pub min_intron_length: usize,
min_intron_length: usize,

/// Minimum number of reads supporting a junction to be considered.
#[arg(short = 'r', long, value_name = "usize", default_value = "2")]
pub min_read_support: usize,
#[arg(short = 'r', long, value_name = "USIZE", default_value = "2")]
min_read_support: usize,

/// Minimum mapping quality for a record to be considered.
/// Set to 0 to disable this filter and allow reads _without_
/// a mapping quality to be considered.
#[arg(short, long, value_name = "U8", default_value = "30")]
pub min_mapq: u8,
min_mapq: u8,

/// Do not count supplementary alignments.
#[arg(long)]
pub no_supplementary: bool,
no_supplementary: bool,

/// Do count secondary alignments.
#[arg(long)]
pub count_secondary: bool,
count_secondary: bool,

/// Do count duplicates.
#[arg(long)]
pub count_duplicates: bool,
count_duplicates: bool,
}

/// Main function for the `ngs derive junction_annotation` subcommand.
Expand Down Expand Up @@ -131,7 +131,7 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
// (3) Summarize found junctions.
compute::summarize(&mut results, &params);

// (3) Print the output to stdout as JSON (more support for different output
// (4) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&results).unwrap();
print!("{}", output);
Expand Down
213 changes: 213 additions & 0 deletions src/derive/command/strandedness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
//! Functionality relating to the `ngs derive strandedness` subcommand itself.
use std::collections::HashMap;
use std::fs::File;
use std::path::PathBuf;

use anyhow::bail;
use anyhow::Context;
use clap::Args;
use noodles::bam;
use noodles::gff;
use noodles::sam;
use rust_lapper::{Interval, Lapper};
use tracing::debug;
use tracing::info;

use crate::derive::strandedness::compute;
use crate::derive::strandedness::compute::ParsedBAMFile;
use crate::derive::strandedness::compute::StrandednessFilters;
use crate::utils::formats;

/// Clap arguments for the `ngs derive strandedness` subcommand.
//
// NOTE: the `///` doc comments on each field below double as the CLI help text
// rendered by clap, and each field name defines the long option name — do not
// rename fields or reword these doc comments without intending to change the CLI.
#[derive(Args)]
pub struct DeriveStrandednessArgs {
    /// Source BAM.
    #[arg(value_name = "BAM")]
    src: PathBuf,

    /// Features GFF file.
    #[arg(short = 'f', long, required = true, value_name = "PATH")]
    features_gff: PathBuf,

    /// When inconclusive, the test will repeat until this many tries have been reached.
    /// Evidence of previous attempts is saved and reused,
    /// leading to a larger sample size with multiple attempts.
    #[arg(long, value_name = "USIZE", default_value = "3")]
    max_tries: usize,

    /// Filter any genes that don't have at least `m` reads.
    #[arg(short = 'm', long, value_name = "USIZE", default_value = "10")]
    min_reads_per_gene: usize,

    /// How many genes to sample.
    #[arg(short = 'n', long, value_name = "USIZE", default_value = "1000")]
    num_genes: usize,

    /// Minimum mapping quality for a record to be considered.
    /// Set to 0 to disable this filter and allow reads _without_
    /// a mapping quality to be considered.
    #[arg(short = 'q', long, value_name = "U8", default_value = "30")]
    min_mapq: u8,

    /// Consider all genes, not just protein coding genes.
    #[arg(long)]
    all_genes: bool,

    /// Name of the gene region feature for the gene model used.
    #[arg(long, value_name = "STRING", default_value = "gene")]
    gene_feature_name: String,

    /// Name of the exon region feature for the gene model used.
    #[arg(long, value_name = "STRING", default_value = "exon")]
    exon_feature_name: String,

    /// Do not count supplementary alignments.
    #[arg(long)]
    no_supplementary: bool,

    /// Do count secondary alignments.
    #[arg(long)]
    count_secondary: bool,

    /// Do count duplicates.
    #[arg(long)]
    count_duplicates: bool,

    /// Do count QC failed reads.
    #[arg(long)]
    count_qc_failed: bool,

    /// At most, search this many times for genes that satisfy our search criteria.
    /// Default is 10 * --num-genes.
    //
    // `None` here means "derive the default at runtime" (see `derive()`), since
    // the default depends on the value of `--num-genes`.
    #[arg(long, value_name = "USIZE")]
    max_iterations_per_try: Option<usize>,
}

/// Main function for the `ngs derive strandedness` subcommand.
///
/// Reads gene and exon features from the provided GFF, builds per-sequence
/// interval trees of exon strands, opens the (indexed) source BAM, and then
/// runs up to `--max-tries` rounds of [`compute::predict`] to derive the
/// strandedness of the RNA-Seq data.
///
/// # Errors
///
/// Returns an error if the GFF or BAM (or its `.bam.bai` index) cannot be
/// opened or parsed, if no gene/exon records match the configured feature
/// names, or if any prediction round fails.
pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
    info!("Starting derive strandedness subcommand.");

    // (1) Parse the GFF file and collect all gene and exon features.
    debug!("Reading all records in GFF.");
    let mut gff = formats::gff::open(&args.features_gff)
        .with_context(|| format!("opening GFF file: {}", args.features_gff.display()))?;

    let mut gene_records = Vec::new();
    let mut exon_records = Vec::new();
    let mut gene_metrics = compute::GeneRecordMetrics::default();
    let mut exon_metrics = compute::ExonRecordMetrics::default();
    for result in gff.records() {
        // Propagate malformed-record errors with context instead of panicking.
        let record = result.with_context(|| {
            format!("parsing GFF record in: {}", args.features_gff.display())
        })?;
        if record.ty() == args.gene_feature_name {
            // If --all-genes is set, keep every gene record.
            // Otherwise, check the gene type or biotype attribute and keep the
            // record only if it's protein coding; records without such an
            // attribute are discarded.
            //
            // BUGFIX: previously `keep_record` started as `false` and was only
            // ever set inside the `!args.all_genes` branch, so `--all-genes`
            // discarded every gene record.
            let mut keep_record = args.all_genes;
            if !args.all_genes {
                let mut gene_type_value = None;
                for entry in record.attributes().as_ref() {
                    gene_type_value = match entry.key() {
                        "gene_type" => Some(entry.value()),    // Gencode
                        "gene_biotype" => Some(entry.value()), // ENSEMBL
                        "biotype" => Some(entry.value()),      // also ENSEMBL
                        _ => gene_type_value,
                    };
                }
                if let Some(gene_type_value) = gene_type_value {
                    if gene_type_value.to_lowercase().contains("protein") {
                        keep_record = true;
                        gene_metrics.protein_coding += 1;
                    }
                }
            }
            gene_metrics.total += 1;
            if keep_record {
                gene_records.push(record);
            }
        } else if record.ty() == args.exon_feature_name {
            exon_metrics.total += 1;
            exon_records.push(record);
        }
    }

    debug!("Tabulating GFF gene and exon features.");

    if gene_records.is_empty() {
        bail!("No gene records matched criteria. Check your GFF file and `--gene-feature-name` and `--all-genes` options.");
    }
    if exon_records.is_empty() {
        bail!("No exon records matched criteria. Check your GFF file and `--exon-feature-name` option.");
    }

    // (2) Build one interval tree (Lapper) per reference sequence, mapping
    // exon coordinate ranges to their annotated strand.
    let mut exon_intervals: HashMap<&str, Vec<Interval<usize, gff::record::Strand>>> =
        HashMap::new();
    for record in &exon_records {
        let seq_name = record.reference_sequence_name();
        let start: usize = record.start().into();
        let stop: usize = record.end().into();
        let strand = record.strand();

        exon_intervals.entry(seq_name).or_default().push(Interval {
            start,
            stop,
            val: strand,
        });
    }

    let mut exons: HashMap<&str, Lapper<usize, gff::record::Strand>> = HashMap::new();
    for (seq_name, intervals) in exon_intervals {
        exons.insert(seq_name, Lapper::new(intervals));
    }

    debug!("Done reading GFF.");

    // (3) Open the BAM and its index; the index is required for the random
    // region queries performed during prediction.
    let mut reader = File::open(&args.src)
        .map(bam::Reader::new)
        .with_context(|| format!("opening BAM file: {}", args.src.display()))?;
    let header = reader.read_header()?.parse()?;
    let index = bam::bai::read(&args.src.with_extension("bam.bai")).with_context(|| {
        format!(
            "reading BAM index: {}",
            args.src.with_extension("bam.bai").display()
        )
    })?;

    let parsed_bam = ParsedBAMFile {
        reader,
        header,
        index,
    };

    let filters = StrandednessFilters {
        min_reads_per_gene: args.min_reads_per_gene,
        min_mapq: args.min_mapq,
        count_qc_failed: args.count_qc_failed,
        no_supplementary: args.no_supplementary,
        count_secondary: args.count_secondary,
        count_duplicates: args.count_duplicates,
    };

    // Default to 10x the number of sampled genes, but never iterate more
    // times than there are candidate gene records.
    let max_iterations_per_try = args
        .max_iterations_per_try
        .unwrap_or(args.num_genes * 10)
        .min(gene_records.len());

    // (4) Run prediction rounds; evidence accumulates across tries.
    for try_num in 1..=args.max_tries {
        info!("Starting try {} of {}", try_num, args.max_tries);

        compute::predict(
            &parsed_bam,
            &mut gene_records,
            &exons,
            max_iterations_per_try,
            args.num_genes,
            &filters,
            &mut gene_metrics,
            &mut exon_metrics,
        )?;
    }

    anyhow::Ok(())
}
2 changes: 1 addition & 1 deletion src/derive/encoding/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pub fn predict(score_set: HashSet<u8>) -> Result<DerivedEncodingResult, anyhow::
result.succeeded = true;
result.encoding = Some("Sanger/Illumina 1.8".to_string());
}
_ => bail!("This shouldn't be possible!"),
_ => unreachable!(),
}

anyhow::Ok(result)
Expand Down
50 changes: 1 addition & 49 deletions src/derive/endedness/compute.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
//! Module holding the logic for computing the endedness of a BAM.
use lazy_static::lazy_static;
use noodles::sam::header;
use serde::Serialize;
use std::collections::HashMap;
use std::collections::HashSet;
use std::sync::Arc;
use tracing::warn;

// Strings used to index into the HashMaps used to store the Read Group ordering flags.
// Lazy statics are used to save memory.
lazy_static! {
/// String used to index into the HashMaps used to store the "overall" ordering flags.
pub static ref OVERALL: Arc<String> = Arc::new(String::from("overall"));

/// String used to index into the HashMaps used to store the "unknown_read_group" ordering flags.
pub static ref UNKNOWN_READ_GROUP: Arc<String> = Arc::new(String::from("unknown_read_group"));
}
use crate::utils::read_groups::{OVERALL, UNKNOWN_READ_GROUP};

/// Struct holding the ordering flags for a single read group.
#[derive(Debug, Clone)]
Expand Down Expand Up @@ -157,44 +147,6 @@ impl DerivedEndednessResult {
}
}

/// Compares the read group tags found in the records
/// and the read groups found in the header.
/// Returns a vector of read group names that were found in the header
/// but not in the records.
pub fn validate_read_group_info(
found_rgs: &HashSet<Arc<String>>,
header: &header::Header,
) -> Vec<String> {
let mut rgs_in_header_not_records = Vec::new();
let mut rgs_in_records_not_header = Vec::new();

for (rg_id, _) in header.read_groups() {
if !found_rgs.contains(rg_id) {
rgs_in_header_not_records.push(rg_id.to_string());
}
}
if !rgs_in_header_not_records.is_empty() {
warn!(
"The following read groups were not found in the file: {:?}",
rgs_in_header_not_records
);
}

for rg_id in found_rgs {
if !header.read_groups().contains_key(rg_id.as_str()) {
rgs_in_records_not_header.push(rg_id.to_string());
}
}
if !rgs_in_records_not_header.is_empty() {
warn!(
"The following read groups were not found in the header: {:?}",
rgs_in_records_not_header
);
}

rgs_in_header_not_records
}

fn calculate_reads_per_template(
read_names: HashMap<String, Vec<Arc<String>>>,
) -> HashMap<Arc<String>, f64> {
Expand Down
3 changes: 3 additions & 0 deletions src/derive/strandedness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Supporting functionality for the `ngs derive strandedness` subcommand.
pub mod compute;
Loading

0 comments on commit 89b60e0

Please sign in to comment.