diff --git a/CHANGELOG.md b/CHANGELOG.md index 0535dd4..9d72b68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [pileup, extract, sample-probs, summary] Allow narrowing of analysis to specific sites with `--include-bed`. - [summary, sample-probs] Add `--only-mapped` flag that will only report on base modification probabilities if they are mapped to the reference. - [pileup] Allow partitioning counts to separate bedMethyl files based on SAM tags with `--partition-tag` option. +### Fixes +- [adjust-mods, update-tags, call-mods] Panic when failure to parse SAM header, fixes #29. ## [v0.1.8] @@ -21,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixes - [call-mods] `--no-filtering` flag properly handled. ### Deprecated -- [adjust-mods] In order to allow `--edge-filter` without also performing an ignore or convert, in the next release `--ignore` will require a explicit argument (`h` is no longer the default). In this release, when no edge filter, conversion, or ignore is provided, `h` will be used for ignore, but this behavior will be removed in the next release. +- [adjust-mods] In order to allow `--edge-filter` without also performing an ignore or convert, in the next release `--ignore` will require an explicit argument (`h` is no longer the default). In this release, when no edge filter, conversion, or ignore is provided, `h` will be used for ignore, but this behavior will be removed in the next release. ## [v0.1.7] ### Changes diff --git a/Cargo.toml b/Cargo.toml index 02b183f..abe2f5b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,7 @@ derive-new = "0.5.9" histo_fp = "0.2.1" prettytable-rs = "0.10.0" rust-lapper = "1.1.0" +linear-map = "1.2.0" [dev-dependencies] similar-asserts = "1.4.2" diff --git a/src/util.rs b/src/util.rs index 50902c9..f2203e4 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,12 +1,14 @@ use anyhow::{anyhow, bail}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::string::FromUtf8Error; use anyhow::Result as AnyhowResult; use derive_new::new; use indicatif::{ProgressBar, ProgressStyle}; -use log::debug; +use linear_map::LinearMap; +use log::{debug, error}; +use regex::Regex; use rust_htslib::bam::{ self, ext::BamRecordExtensions, header::HeaderRecord, record::Aux, HeaderView, @@ -328,8 +330,62 @@ impl Region { } } +// shouldn't need this once it's fixed in rust-htslib or the repo moves to noodles.. +fn header_to_hashmap( + header: &bam::Header, +) -> anyhow::Result>>> { + let mut header_map = HashMap::default(); + let record_type_regex = Regex::new(r"@([A-Z][A-Z])").unwrap(); + let tag_regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap(); + + if let Ok(header_string) = String::from_utf8(header.to_bytes()) { + for line in header_string.split('\n').filter(|x| !x.is_empty()) { + let parts: Vec<_> = + line.split('\t').filter(|x| !x.is_empty()).collect(); + if parts.is_empty() { + continue; + } + let record_type = record_type_regex + .captures(parts[0]) + .and_then(|captures| captures.get(1)) + .map(|m| m.as_str().to_owned()); + + if let Some(record_type) = record_type { + if record_type == "CO" { + continue; + } + let mut field = LinearMap::default(); + for part in parts.iter().skip(1) { + if let Some(cap) = tag_regex.captures(part) { + let tag = cap.get(1).unwrap().as_str().to_owned(); + let value = cap.get(2).unwrap().as_str().to_owned(); + field.insert(tag, value); + } else { + debug!("encounted illegal record line {line}"); + } + } + header_map + .entry(record_type) + .or_insert_with(Vec::new) + .push(field); + } else { + debug!("encountered illegal record type in line {line}"); + } + } + Ok(header_map) + } else { + bail!("failed to parse header string") + } +} + pub fn add_modkit_pg_records(header: &mut bam::Header) { - let header_map = header.to_hashmap(); + let header_map = match header_to_hashmap(&header) { + Ok(hm) => hm, + Err(_) => { + error!("failed to parse input BAM header, not adding PG header record for modkit"); + return; + } + }; let (id, pp) = if let Some(pg_tags) = header_map.get("PG") { let modkit_invocations = pg_tags.iter().filter_map(|tags| { tags.get("ID").and_then(|v| {