Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: remove gene regions from "strucvars query" database enhancement (#221) #222

Merged
merged 6 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 0 additions & 24 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -514,30 +514,6 @@ where
Ok(buffer)
}

/// Enum for the supported gene/transcript databases.
#[derive(
serde::Serialize,
serde::Deserialize,
enum_map::Enum,
PartialEq,
Eq,
Clone,
Copy,
Debug,
Default,
strum::EnumString,
strum::Display,
)]
#[serde(rename_all = "lowercase")]
#[strum(serialize_all = "lowercase")]
pub enum Database {
/// RefSeq
#[default]
RefSeq,
/// ENSEMBL
Ensembl,
}

/// Enum for the supported TADs.
#[derive(
serde::Serialize,
Expand Down
6 changes: 3 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ enum Commands {
Seqvars(Seqvars),
}

/// Parsing of "sv *" sub commands.
/// Parsing of "strucvars *" sub commands.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
struct Strucvars {
Expand All @@ -44,7 +44,7 @@ struct Strucvars {
command: StrucvarsCommands,
}

/// Enum supporting the parsing of "sv *" sub commands.
/// Enum supporting the parsing of "strucvars *" sub commands.
#[derive(Debug, Subcommand)]
enum StrucvarsCommands {
Aggregate(strucvars::aggregate::cli::Args),
Expand All @@ -62,7 +62,7 @@ struct Seqvars {
command: SeqvarsCommands,
}

/// Enum supporting the parsing of "sv *" sub commands.
/// Enum supporting the parsing of "strucvars *" sub commands.
#[derive(Debug, Subcommand)]
enum SeqvarsCommands {
Aggregate(seqvars::aggregate::Args),
Expand Down
18 changes: 0 additions & 18 deletions src/proto/varfish/v1/sv.proto
Original file line number Diff line number Diff line change
Expand Up @@ -56,24 +56,6 @@ message MaskedDatabase {
repeated MaskedDbRecord records = 1;
}

// Record for the gene region database.
message GeneRegionRecord {
// Numeric chromosome number.
int32 chrom_no = 1;
// 1-based start position.
int32 start = 2;
// 1-based stop position.
int32 stop = 3;
// Numeric gene identifier.
uint32 gene_id = 4;
}

// Gene region database.
message GeneRegionDatabase {
// List of gene region records.
repeated GeneRegionRecord records = 1;
}

// Record for the gene cross-link database.
message XlinkRecord {
// HGNC ID (e.g., "HGNC:123")
Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/aggregate/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ pub struct Args {
pub slack_ins: i32,
}

/// Main entry point for the `sv bg-db-to-bin` command.
/// Main entry point for the `strucvars txt-to-bin` command.
pub fn run(common_args: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("Starting `strucvars aggregate`");
tracing::info!(" common_args = {:?}", &common_args);
Expand Down
2 changes: 1 addition & 1 deletion src/strucvars/query/bgdbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ impl BeginEnd for BgDbRecord {
}
}

/// Load background database from a `.bin` file as created by `sv convert-bgdb`.
/// Load background database from a `.bin` file as created by `strucvar txt-to-bin`.
#[tracing::instrument]
pub fn load_bg_db_records(path: &Path) -> Result<BgDb, anyhow::Error> {
tracing::debug!("loading binary bg db records from {:?}", path);
Expand Down
145 changes: 2 additions & 143 deletions src/strucvars/query/genes.rs
Original file line number Diff line number Diff line change
@@ -1,90 +1,13 @@
//! Code for supporting annotation with overlapping genes.

use std::{collections::HashSet, ops::Range, path::Path, time::Instant};
use std::{collections::HashSet, path::Path, time::Instant};

use bio::data_structures::interval_tree::ArrayBackedIntervalTree;
use mehari::common::open_read_maybe_gz;
use prost::Message;
use serde::Deserialize;
use tracing::info;

use crate::{
common::{Database, GenomeRelease, CHROMS},
strucvars::pbs,
};

/// Alias for the interval tree that we use.
type IntervalTree = ArrayBackedIntervalTree<i32, u32>;

/// Information to store for a TAD set entry.
#[derive(Default, Clone, Debug)]
pub struct GeneRegionRecord {
/// 0-based begin position.
pub begin: i32,
/// End position.
pub end: i32,
/// gene ID
pub gene_id: u32,
}

/// Gene region overlapping information.
#[derive(Default, Debug)]
pub struct GeneRegionDb {
/// Records, stored by chromosome.
pub records: Vec<Vec<GeneRegionRecord>>,
/// Interval trees, stored by chromosome.
pub trees: Vec<IntervalTree>,
}

impl GeneRegionDb {
/// Return IDs of overlapping genes.
pub fn overlapping_gene_ids(&self, chrom_no: u32, query: Range<i32>) -> Vec<u32> {
self.trees[chrom_no as usize]
.find(query)
.iter()
.map(|cursor| self.records[chrom_no as usize][*cursor.data() as usize].gene_id)
.collect()
}
}

#[tracing::instrument]
fn load_gene_regions_db(path: &Path) -> Result<GeneRegionDb, anyhow::Error> {
tracing::debug!("loading binary gene region records from {:?}...", path);

let before_loading = Instant::now();
let mut result = GeneRegionDb::default();
for _ in CHROMS {
result.records.push(Vec::new());
result.trees.push(IntervalTree::new());
}

let fcontents =
std::fs::read(path).map_err(|e| anyhow::anyhow!("error reading {:?}: {}", &path, e))?;
let gene_region_db = pbs::GeneRegionDatabase::decode(std::io::Cursor::new(fcontents))
.map_err(|e| anyhow::anyhow!("error decoding {:?}: {}", &path, e))?;

let mut total_count = 0;
for record in gene_region_db.records.into_iter() {
let chrom_no = record.chrom_no as usize;
let key = (record.start - 1)..record.stop;
result.trees[chrom_no].insert(key, result.records[chrom_no].len() as u32);
result.records[chrom_no].push(GeneRegionRecord {
begin: record.start - 1,
end: record.stop,
gene_id: record.gene_id,
});

total_count += 1;
}
result.trees.iter_mut().for_each(|tree| tree.index());
tracing::debug!(
"... done loading {} records and building trees in {:?}",
total_count,
before_loading.elapsed(),
);

Ok(result)
}
use crate::{common::GenomeRelease, strucvars::pbs};

/// Information to store for the interlink table.
#[derive(Default, Debug)]
Expand All @@ -108,43 +31,6 @@ pub struct XlinkDb {
pub from_hgnc: multimap::MultiMap<String, u32>,
}

impl XlinkDb {
fn entrez_id_to_symbols(&self, entrez_id: u32) -> Vec<String> {
let mut tmp = self
.from_entrez
.get_vec(&entrez_id)
.map_or(Vec::new(), |idxs| {
idxs.iter()
.map(|idx| self.records[*idx as usize].symbol.clone())
.collect::<Vec<String>>()
});
tmp.sort();
tmp.dedup();
tmp
}

fn ensembl_to_symbols(&self, ensembl_id: u32) -> Vec<String> {
let mut tmp = self
.from_ensembl
.get_vec(&ensembl_id)
.map_or(Vec::new(), |idxs| {
idxs.iter()
.map(|idx| self.records[*idx as usize].symbol.clone())
.collect::<Vec<String>>()
});
tmp.sort();
tmp.dedup();
tmp
}

pub fn gene_id_to_symbols(&self, database: Database, gene_id: u32) -> Vec<String> {
match database {
Database::RefSeq => self.entrez_id_to_symbols(gene_id),
Database::Ensembl => self.ensembl_to_symbols(gene_id),
}
}
}

#[tracing::instrument]
fn load_xlink_db(path: &Path) -> Result<XlinkDb, anyhow::Error> {
tracing::debug!("loading binary xlink records from {:?}...", path);
Expand Down Expand Up @@ -287,44 +173,17 @@ fn load_mim2gene_db(path: &Path) -> Result<OmimDb, anyhow::Error> {
/// Bundle of gene region DBs and the xlink info packaged with VarFish.
#[derive(Default, Debug)]
pub struct GeneDb {
pub refseq: GeneRegionDb,
pub ensembl: GeneRegionDb,
pub xlink: XlinkDb,
pub acmg: AcmgDb,
pub mim2gene: OmimDb,
}

impl GeneDb {
/// Return IDs of overlapping genes.
pub fn overlapping_gene_ids(
&self,
database: Database,
chrom_no: u32,
query: Range<i32>,
) -> Vec<u32> {
match database {
Database::RefSeq => self.refseq.overlapping_gene_ids(chrom_no, query),
Database::Ensembl => self.ensembl.overlapping_gene_ids(chrom_no, query),
}
}
}

// Load all gene information, such as region, id mapping and symbols.
#[tracing::instrument]
pub fn load_gene_db(path_db: &str, genome_release: GenomeRelease) -> Result<GeneDb, anyhow::Error> {
info!("Loading gene dbs");

let result = GeneDb {
refseq: load_gene_regions_db(
Path::new(path_db)
.join(format!("{}/genes/refseq_regions.bin", genome_release))
.as_path(),
)?,
ensembl: load_gene_regions_db(
Path::new(path_db)
.join(format!("{}/genes/ensembl_regions.bin", genome_release))
.as_path(),
)?,
xlink: load_xlink_db(Path::new(path_db).join("noref/genes/xlink.bin").as_path())?,
acmg: load_acmg_db(Path::new(path_db).join("noref/genes/acmg.tsv").as_path())?,
mim2gene: load_mim2gene_db(
Expand Down
Loading
Loading