Skip to content

Commit

Permalink
[WIP] to share code. Partial strandedness implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Feb 8, 2024
1 parent d872c82 commit 89b60e0
Show file tree
Hide file tree
Showing 12 changed files with 637 additions and 62 deletions.
1 change: 1 addition & 0 deletions src/derive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ pub mod endedness;
pub mod instrument;
pub mod junction_annotation;
pub mod readlen;
pub mod strandedness;
5 changes: 5 additions & 0 deletions src/derive/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod endedness;
pub mod instrument;
pub mod junction_annotation;
pub mod readlen;
pub mod strandedness;

use clap::Args;
use clap::Subcommand;
Expand Down Expand Up @@ -36,6 +37,10 @@ pub enum DeriveSubcommand {
/// Derives the read length of the file.
Readlen(self::readlen::DeriveReadlenArgs),

/// Derives the strandedness of the RNA-Seq file.
/// This subcommand requires a GFF file.
Strandedness(self::strandedness::DeriveStrandednessArgs),

/// Annotates junctions in the file.
/// This subcommand requires a GFF file with features to annotate.
/// This subcommand does not "derive" anything, but is included here for
Expand Down
5 changes: 2 additions & 3 deletions src/derive/command/endedness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@ use tracing::info;
use tracing::trace;

use crate::derive::endedness::compute;
use crate::derive::endedness::compute::{
validate_read_group_info, OrderingFlagsCounts, OVERALL, UNKNOWN_READ_GROUP,
};
use crate::derive::endedness::compute::OrderingFlagsCounts;
use crate::utils::args::arg_in_range as deviance_in_range;
use crate::utils::args::NumberOfRecords;
use crate::utils::display::RecordCounter;
use crate::utils::formats::bam::ParsedBAMFile;
use crate::utils::formats::utils::IndexCheck;
use crate::utils::read_groups::{validate_read_group_info, OVERALL, UNKNOWN_READ_GROUP};

/// Clap arguments for the `ngs derive endedness` subcommand.
#[derive(Args)]
Expand Down
18 changes: 9 additions & 9 deletions src/derive/command/junction_annotation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,34 +31,34 @@ pub struct JunctionAnnotationArgs {

/// Name of the exon region feature for the gene model used.
#[arg(long, value_name = "STRING", default_value = "exon")]
pub exon_feature_name: String,
exon_feature_name: String,

/// Minimum intron length to consider.
/// An intron is defined as an `N` CIGAR operation of any length.
#[arg(short = 'i', long, value_name = "USIZE", default_value = "50")]
pub min_intron_length: usize,
min_intron_length: usize,

/// Minimum number of reads supporting a junction to be considered.
#[arg(short = 'r', long, value_name = "usize", default_value = "2")]
pub min_read_support: usize,
#[arg(short = 'r', long, value_name = "USIZE", default_value = "2")]
min_read_support: usize,

/// Minimum mapping quality for a record to be considered.
/// Set to 0 to disable this filter and allow reads _without_
/// a mapping quality to be considered.
#[arg(short, long, value_name = "U8", default_value = "30")]
pub min_mapq: u8,
min_mapq: u8,

/// Do not count supplementary alignments.
#[arg(long)]
pub no_supplementary: bool,
no_supplementary: bool,

/// Do count secondary alignments.
#[arg(long)]
pub count_secondary: bool,
count_secondary: bool,

/// Do count duplicates.
#[arg(long)]
pub count_duplicates: bool,
count_duplicates: bool,
}

/// Main function for the `ngs derive junction_annotation` subcommand.
Expand Down Expand Up @@ -131,7 +131,7 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
// (3) Summarize found junctions.
compute::summarize(&mut results, &params);

// (3) Print the output to stdout as JSON (more support for different output
// (4) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&results).unwrap();
print!("{}", output);
Expand Down
213 changes: 213 additions & 0 deletions src/derive/command/strandedness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
//! Functionality relating to the `ngs derive strandedness` subcommand itself.
use std::collections::HashMap;
use std::fs::File;
use std::path::PathBuf;

use anyhow::bail;
use anyhow::Context;
use clap::Args;
use noodles::bam;
use noodles::gff;
use noodles::sam;
use rust_lapper::{Interval, Lapper};
use tracing::debug;
use tracing::info;

use crate::derive::strandedness::compute;
use crate::derive::strandedness::compute::ParsedBAMFile;
use crate::derive::strandedness::compute::StrandednessFilters;
use crate::utils::formats;

/// Clap arguments for the `ngs derive strandedness` subcommand.
//
// NOTE: the `///` doc comments on each field below double as the CLI help text
// rendered by clap, and each field name defines the long option name — do not
// rename fields or reword these doc comments without intending to change the CLI.
#[derive(Args)]
pub struct DeriveStrandednessArgs {
    /// Source BAM.
    #[arg(value_name = "BAM")]
    src: PathBuf,

    /// Features GFF file.
    #[arg(short = 'f', long, required = true, value_name = "PATH")]
    features_gff: PathBuf,

    /// When inconclusive, the test will repeat until this many tries have been reached.
    /// Evidence of previous attempts is saved and reused,
    /// leading to a larger sample size with multiple attempts.
    #[arg(long, value_name = "USIZE", default_value = "3")]
    max_tries: usize,

    /// Filter any genes that don't have at least `m` reads.
    #[arg(short = 'm', long, value_name = "USIZE", default_value = "10")]
    min_reads_per_gene: usize,

    /// How many genes to sample.
    #[arg(short = 'n', long, value_name = "USIZE", default_value = "1000")]
    num_genes: usize,

    /// Minimum mapping quality for a record to be considered.
    /// Set to 0 to disable this filter and allow reads _without_
    /// a mapping quality to be considered.
    #[arg(short = 'q', long, value_name = "U8", default_value = "30")]
    min_mapq: u8,

    /// Consider all genes, not just protein coding genes.
    #[arg(long)]
    all_genes: bool,

    /// Name of the gene region feature for the gene model used.
    #[arg(long, value_name = "STRING", default_value = "gene")]
    gene_feature_name: String,

    /// Name of the exon region feature for the gene model used.
    #[arg(long, value_name = "STRING", default_value = "exon")]
    exon_feature_name: String,

    /// Do not count supplementary alignments.
    #[arg(long)]
    no_supplementary: bool,

    /// Do count secondary alignments.
    #[arg(long)]
    count_secondary: bool,

    /// Do count duplicates.
    #[arg(long)]
    count_duplicates: bool,

    /// Do count QC failed reads.
    #[arg(long)]
    count_qc_failed: bool,

    /// At most, search this many times for genes that satisfy our search criteria.
    /// Default is 10 * --num-genes.
    //
    // `None` here means "derive the default at runtime" (see `derive()`), since
    // the default depends on the value of `--num-genes`.
    #[arg(long, value_name = "USIZE")]
    max_iterations_per_try: Option<usize>,
}

/// Main function for the `ngs derive strandedness` subcommand.
///
/// Reads gene and exon features from the provided GFF, builds per-sequence
/// interval trees of exon strands, opens the (indexed) source BAM, and then
/// runs up to `--max-tries` rounds of [`compute::predict`] to derive the
/// strandedness of the RNA-Seq data.
///
/// # Errors
///
/// Returns an error if the GFF or BAM (or its `.bam.bai` index) cannot be
/// opened or parsed, if no gene/exon records match the configured feature
/// names, or if any prediction round fails.
pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
    info!("Starting derive strandedness subcommand.");

    // (1) Parse the GFF file and collect all gene and exon features.
    debug!("Reading all records in GFF.");
    let mut gff = formats::gff::open(&args.features_gff)
        .with_context(|| format!("opening GFF file: {}", args.features_gff.display()))?;

    let mut gene_records = Vec::new();
    let mut exon_records = Vec::new();
    let mut gene_metrics = compute::GeneRecordMetrics::default();
    let mut exon_metrics = compute::ExonRecordMetrics::default();
    for result in gff.records() {
        // Propagate malformed-record errors with context instead of panicking.
        let record = result.with_context(|| {
            format!("parsing GFF record in: {}", args.features_gff.display())
        })?;
        if record.ty() == args.gene_feature_name {
            // If --all-genes is set, keep every gene record.
            // Otherwise, check the gene type or biotype attribute and keep the
            // record only if it's protein coding; records without such an
            // attribute are discarded.
            //
            // BUGFIX: previously `keep_record` started as `false` and was only
            // ever set inside the `!args.all_genes` branch, so `--all-genes`
            // discarded every gene record.
            let mut keep_record = args.all_genes;
            if !args.all_genes {
                let mut gene_type_value = None;
                for entry in record.attributes().as_ref() {
                    gene_type_value = match entry.key() {
                        "gene_type" => Some(entry.value()),    // Gencode
                        "gene_biotype" => Some(entry.value()), // ENSEMBL
                        "biotype" => Some(entry.value()),      // also ENSEMBL
                        _ => gene_type_value,
                    };
                }
                if let Some(gene_type_value) = gene_type_value {
                    if gene_type_value.to_lowercase().contains("protein") {
                        keep_record = true;
                        gene_metrics.protein_coding += 1;
                    }
                }
            }
            gene_metrics.total += 1;
            if keep_record {
                gene_records.push(record);
            }
        } else if record.ty() == args.exon_feature_name {
            exon_metrics.total += 1;
            exon_records.push(record);
        }
    }

    debug!("Tabulating GFF gene and exon features.");

    if gene_records.is_empty() {
        bail!("No gene records matched criteria. Check your GFF file and `--gene-feature-name` and `--all-genes` options.");
    }
    if exon_records.is_empty() {
        bail!("No exon records matched criteria. Check your GFF file and `--exon-feature-name` option.");
    }

    // (2) Build one interval tree (Lapper) per reference sequence, mapping
    // exon coordinate ranges to their annotated strand.
    let mut exon_intervals: HashMap<&str, Vec<Interval<usize, gff::record::Strand>>> =
        HashMap::new();
    for record in &exon_records {
        let seq_name = record.reference_sequence_name();
        let start: usize = record.start().into();
        let stop: usize = record.end().into();
        let strand = record.strand();

        exon_intervals.entry(seq_name).or_default().push(Interval {
            start,
            stop,
            val: strand,
        });
    }

    let mut exons: HashMap<&str, Lapper<usize, gff::record::Strand>> = HashMap::new();
    for (seq_name, intervals) in exon_intervals {
        exons.insert(seq_name, Lapper::new(intervals));
    }

    debug!("Done reading GFF.");

    // (3) Open the BAM and its index; the index is required for the random
    // region queries performed during prediction.
    let mut reader = File::open(&args.src)
        .map(bam::Reader::new)
        .with_context(|| format!("opening BAM file: {}", args.src.display()))?;
    let header = reader.read_header()?.parse()?;
    let index = bam::bai::read(&args.src.with_extension("bam.bai")).with_context(|| {
        format!(
            "reading BAM index: {}",
            args.src.with_extension("bam.bai").display()
        )
    })?;

    let parsed_bam = ParsedBAMFile {
        reader,
        header,
        index,
    };

    let filters = StrandednessFilters {
        min_reads_per_gene: args.min_reads_per_gene,
        min_mapq: args.min_mapq,
        count_qc_failed: args.count_qc_failed,
        no_supplementary: args.no_supplementary,
        count_secondary: args.count_secondary,
        count_duplicates: args.count_duplicates,
    };

    // Default to 10x the number of sampled genes, but never iterate more
    // times than there are candidate gene records.
    let max_iterations_per_try = args
        .max_iterations_per_try
        .unwrap_or(args.num_genes * 10)
        .min(gene_records.len());

    // (4) Run prediction rounds; evidence accumulates across tries.
    for try_num in 1..=args.max_tries {
        info!("Starting try {} of {}", try_num, args.max_tries);

        compute::predict(
            &parsed_bam,
            &mut gene_records,
            &exons,
            max_iterations_per_try,
            args.num_genes,
            &filters,
            &mut gene_metrics,
            &mut exon_metrics,
        )?;
    }

    anyhow::Ok(())
}
2 changes: 1 addition & 1 deletion src/derive/encoding/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pub fn predict(score_set: HashSet<u8>) -> Result<DerivedEncodingResult, anyhow::
result.succeeded = true;
result.encoding = Some("Sanger/Illumina 1.8".to_string());
}
_ => bail!("This shouldn't be possible!"),
_ => unreachable!(),
}

anyhow::Ok(result)
Expand Down
50 changes: 1 addition & 49 deletions src/derive/endedness/compute.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
//! Module holding the logic for computing the endedness of a BAM.
use lazy_static::lazy_static;
use noodles::sam::header;
use serde::Serialize;
use std::collections::HashMap;
use std::collections::HashSet;
use std::sync::Arc;
use tracing::warn;

// Strings used to index into the HashMaps used to store the Read Group ordering flags.
// Lazy statics are used to save memory.
lazy_static! {
/// String used to index into the HashMaps used to store the "overall" ordering flags.
pub static ref OVERALL: Arc<String> = Arc::new(String::from("overall"));

/// String used to index into the HashMaps used to store the "unknown_read_group" ordering flags.
pub static ref UNKNOWN_READ_GROUP: Arc<String> = Arc::new(String::from("unknown_read_group"));
}
use crate::utils::read_groups::{OVERALL, UNKNOWN_READ_GROUP};

/// Struct holding the ordering flags for a single read group.
#[derive(Debug, Clone)]
Expand Down Expand Up @@ -157,44 +147,6 @@ impl DerivedEndednessResult {
}
}

/// Compares the read group tags found in the records
/// and the read groups found in the header.
/// Returns a vector of read group names that were found in the header
/// but not in the records.
pub fn validate_read_group_info(
found_rgs: &HashSet<Arc<String>>,
header: &header::Header,
) -> Vec<String> {
let mut rgs_in_header_not_records = Vec::new();
let mut rgs_in_records_not_header = Vec::new();

for (rg_id, _) in header.read_groups() {
if !found_rgs.contains(rg_id) {
rgs_in_header_not_records.push(rg_id.to_string());
}
}
if !rgs_in_header_not_records.is_empty() {
warn!(
"The following read groups were not found in the file: {:?}",
rgs_in_header_not_records
);
}

for rg_id in found_rgs {
if !header.read_groups().contains_key(rg_id.as_str()) {
rgs_in_records_not_header.push(rg_id.to_string());
}
}
if !rgs_in_records_not_header.is_empty() {
warn!(
"The following read groups were not found in the header: {:?}",
rgs_in_records_not_header
);
}

rgs_in_header_not_records
}

fn calculate_reads_per_template(
read_names: HashMap<String, Vec<Arc<String>>>,
) -> HashMap<Arc<String>, f64> {
Expand Down
3 changes: 3 additions & 0 deletions src/derive/strandedness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! Supporting functionality for the `ngs derive strandedness` subcommand.
pub mod compute;
Loading

0 comments on commit 89b60e0

Please sign in to comment.