Skip to content

Commit

Permalink
feat: adding command "db create seqvars-clinvar" (#24) (#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Mar 31, 2023
1 parent a1d3bfa commit fd4c523
Show file tree
Hide file tree
Showing 22 changed files with 597 additions and 180 deletions.
5 changes: 3 additions & 2 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
tests/data/db/create/txs/latest/** filter=lfs diff=lfs merge=lfs -text
tests/data/vars/*.vcf*
tests/data/vars/*.tsv*
tests/data/vars/*.vcf* filter=lfs diff=lfs merge=lfs -text
tests/data/vars/*.tsv* filter=lfs diff=lfs merge=lfs -text
tests/data/** filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ parse-display = "0.8.0"
nom = "7.1.3"
linked-hash-map = "0.5.6"
enumset = { version = "1.0.12", features = ["serde"] }
bincode = "1.3.3"
bgzip = "0.3.1"

[build-dependencies]
flatc-rust = "0.2.0"
Expand Down
14 changes: 13 additions & 1 deletion docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ $ bcftools \
```

The reduction of this is quite big.
For GRCh37, the gnomAD raw download files go down from 449GB to 2.8GB and for GRCh38 from 2.2TB to XXXGB.
For GRCh37, the gnomAD raw download files go down from 449GB to 2.8GB and for GRCh38 from 2.2TB to 4.2GB.

## Building Frequency Databases

Expand Down Expand Up @@ -157,3 +157,15 @@ $ mehari db create txs \

You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`.
For GRCh38, simply use `--genome-release grch38`.

## Building ClinVar Database

This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).

```
$ mehari db create seqvar-clinvar \
--path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \
--path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz
```

You can specify an optional `--genome-release grch37` argument that will be used to check the ClinVar database to be compatible with your data.
41 changes: 36 additions & 5 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,15 @@ impl ConsequencePredictor {
txs.sort_by(|a, b| a.tx_ac.cmp(&b.tx_ac));

// Generate `AnnField` records for each transcript.
//
// Skip `None` results.
Ok(Some(
txs.into_iter()
.map(|tx| self.build_ann_field(&var, tx, chrom_acc.clone(), var_start, var_end))
.collect::<Result<Vec<_>, _>>()?,
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.filter_map(|x| x)
.collect::<Vec<_>>(),
))
}

Expand All @@ -121,11 +126,18 @@ impl ConsequencePredictor {
chrom_acc: String,
var_start: i32,
var_end: i32,
) -> Result<AnnField, anyhow::Error> {
) -> Result<Option<AnnField>, anyhow::Error> {
// NB: The coordinates of var_start, var_end, as well as the exon boundaries
// are 0-based.

let tx = self.provider.get_tx(&tx_record.tx_ac).unwrap();

// Skip transcripts that are protein coding but do not have a CDS.
// TODO: do not include such transcripts when building the database.
if tx.biotype == TranscriptBiotype::Coding && tx.start_codon.is_none() {
return Ok(None);
}

let mut consequences: Vec<Consequence> = Vec::new();

let alignment = tx.genome_alignments.first().unwrap();
Expand Down Expand Up @@ -366,7 +378,26 @@ impl ConsequencePredictor {
let prot_len = cds_len / 3;

let var_c = self.mapper.n_to_c(&var_n)?;
let var_p = self.mapper.c_to_p(&var_c)?;
// Gracefully handle the case that the transcript is unsupported because the length
// is not a multiple of 3.
// TODO: do not include such transcripts when building the tx database.
let var_p = self.mapper.c_to_p(&var_c).map_or_else(
|e| {
if e.to_string()
.contains("is not supported because its sequence length of")
{
Ok(None)
} else {
Err(e)
}
},
|v| Ok(Some(v)),
)?;
if var_p.is_none() {
return Ok(None);
}
let var_p = var_p.unwrap();

let hgvs_p = Some(format!("{}", &var_p));
let cds_pos = match &var_c {
HgvsVariant::CdsVariant { loc_edit, .. } => Some(Pos {
Expand Down Expand Up @@ -547,7 +578,7 @@ impl ConsequencePredictor {
let putative_impact = (*consequences.first().unwrap()).into();

// Build and return ANN field from the information derived above.
Ok(AnnField {
Ok(Some(AnnField {
allele: Allele::Alt {
alternative: var.alternative.clone(),
},
Expand All @@ -568,7 +599,7 @@ impl ConsequencePredictor {
protein_pos,
distance,
messages: None,
})
}))
}

// Normalize variant by stripping common suffixes and prefixes.
Expand Down
111 changes: 92 additions & 19 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use thousands::Separable;
use crate::annotate::seqvars::csq::{ConsequencePredictor, VcfVariant};
use crate::annotate::seqvars::provider::MehariProvider;
use crate::common::GenomeRelease;
use crate::db::create::seqvar_clinvar::serialize::Record as ClinvarRecord;
use crate::db::create::seqvar_freqs::reading::guess_assembly;
use crate::db::create::seqvar_freqs::serialized::vcf::Var as VcfVar;
use crate::db::create::seqvar_freqs::serialized::{
Expand Down Expand Up @@ -98,6 +99,9 @@ pub mod keys {
pub static ref HELIX_HET: InfoKey = InfoKey::Other(InfoKeyOther::from_str("helix_het").unwrap());

pub static ref ANN: InfoKey = InfoKey::Other(InfoKeyOther::from_str("ANN").unwrap());

pub static ref CLINVAR_PATHO: InfoKey = InfoKey::Other(InfoKeyOther::from_str("clinvar_patho").unwrap());
pub static ref CLINVAR_VCV: InfoKey = InfoKey::Other(InfoKeyOther::from_str("clinvar_vcv").unwrap());
}
}

Expand Down Expand Up @@ -207,20 +211,28 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader {
),
);

header_out.infos_mut().insert(
keys::CLINVAR_PATHO.clone(),
Map::<Info>::new(Number::Count(1), Type::String, "ClinVar pathogenicity"),
);
header_out.infos_mut().insert(
keys::CLINVAR_VCV.clone(),
Map::<Info>::new(Number::Count(1), Type::String, "ClinVar VCV accession"),
);

header_out
}

/// Annotate record on autosomal chromosome with gnomAD exomes/genomes.
fn annotate_record_auto<T>(
db: &rocksdb::DBWithThreadMode<T>,
cf: &rocksdb::ColumnFamily,
vcf_var: VcfVar,
key: &Vec<u8>,
vcf_record: &mut noodles::vcf::Record,
) -> Result<(), anyhow::Error>
where
T: ThreadMode,
{
let key: Vec<u8> = vcf_var.into();
if let Some(freq) = db.get_cf(cf, key)? {
let auto_record = AutoRecord::from_buf(&freq);

Expand Down Expand Up @@ -257,13 +269,12 @@ where
fn annotate_record_xy<T>(
db: &rocksdb::DBWithThreadMode<T>,
cf: &rocksdb::ColumnFamily,
vcf_var: VcfVar,
key: &Vec<u8>,
vcf_record: &mut noodles::vcf::Record,
) -> Result<(), anyhow::Error>
where
T: ThreadMode,
{
let key: Vec<u8> = vcf_var.into();
if let Some(freq) = db.get_cf(cf, key)? {
let auto_record = XyRecord::from_buf(&freq);

Expand Down Expand Up @@ -308,13 +319,12 @@ where
fn annotate_record_mt<T>(
db: &rocksdb::DBWithThreadMode<T>,
cf: &rocksdb::ColumnFamily,
vcf_var: VcfVar,
key: &Vec<u8>,
vcf_record: &mut noodles::vcf::Record,
) -> Result<(), anyhow::Error>
where
T: ThreadMode,
{
let key: Vec<u8> = vcf_var.into();
if let Some(freq) = db.get_cf(cf, key)? {
let mt_record = MtRecord::from_buf(&freq);

Expand Down Expand Up @@ -347,6 +357,39 @@ where
Ok(())
}

/// Annotate record with ClinVar information.
fn annotate_record_clinvar<T>(
db: &rocksdb::DBWithThreadMode<T>,
cf: &rocksdb::ColumnFamily,
key: &Vec<u8>,
vcf_record: &mut noodles::vcf::Record,
) -> Result<(), anyhow::Error>
where
T: ThreadMode,
{
if let Some(clinvar_anno) = db.get_cf(cf, key)? {
let clinvar_record: ClinvarRecord = bincode::deserialize(&clinvar_anno)?;

let ClinvarRecord {
summary_clinvar_pathogenicity,
vcv,
..
} = clinvar_record;

vcf_record.info_mut().insert(
keys::CLINVAR_PATHO.clone(),
Some(Value::String(
summary_clinvar_pathogenicity.first().unwrap().to_string(),
)),
);
vcf_record
.info_mut()
.insert(keys::CLINVAR_VCV.clone(), Some(Value::String(vcv.clone())));
}

Ok(())
}

lazy_static::lazy_static! {
static ref CHROM_MT: HashSet<&'static str> = HashSet::from_iter(["M", "MT", "chrM", "chrMT"].into_iter());
static ref CHROM_XY: HashSet<&'static str> = HashSet::from_iter(["M", "MT", "chrM", "chrMT"].into_iter());
Expand Down Expand Up @@ -554,23 +597,43 @@ fn run_with_writer<Inner: Write>(
let assembly = guess_assembly(&header_in, false, genome_release)?;
tracing::info!("Determined input assembly to be {:?}", &assembly);

// Open the RocksDB database in read only mode.
// Open the frequency RocksDB database in read only mode.
tracing::info!("Opening frequency database");
let rocksdb_path = format!(
"{}/seqvars/{}/freqs",
&args.path_db,
path_component(assembly)
);
tracing::debug!("RocksDB path = {}", &rocksdb_path);
let options = rocksdb::Options::default();
let db = rocksdb::DB::open_cf_for_read_only(
let db_freq = rocksdb::DB::open_cf_for_read_only(
&options,
&format!(
"{}/seqvars/{}/freqs",
&args.path_db,
path_component(assembly)
),
&rocksdb_path,
["meta", "autosomal", "gonosomal", "mitochondrial"],
false,
)?;

let cf_autosomal = db.cf_handle("autosomal").unwrap();
let cf_gonosomal = db.cf_handle("gonosomal").unwrap();
let cf_mtdna = db.cf_handle("mitochondrial").unwrap();
let cf_autosomal = db_freq.cf_handle("autosomal").unwrap();
let cf_gonosomal = db_freq.cf_handle("gonosomal").unwrap();
let cf_mtdna = db_freq.cf_handle("mitochondrial").unwrap();

// Open the ClinVar RocksDB database in read only mode.
tracing::info!("Opening ClinVar database");
let rocksdb_path = format!(
"{}/seqvars/{}/clinvar",
&args.path_db,
path_component(assembly)
);
tracing::debug!("RocksDB path = {}", &rocksdb_path);
let options = rocksdb::Options::default();
let db_clinvar = rocksdb::DB::open_cf_for_read_only(
&options,
&rocksdb_path,
["meta", "clinvar_seqvars"],
false,
)?;

let cf_clinvar = db_clinvar.cf_handle("clinvar_seqvars").unwrap();

// Open the transcript flatbuffer.
tracing::info!("Opening transcript database");
Expand All @@ -597,23 +660,30 @@ fn run_with_writer<Inner: Write>(
loop {
if let Some(record) = records.next() {
let mut vcf_record = record?;

// TODO: ignores all but the first alternative allele!
let vcf_var = VcfVar::from_vcf(&vcf_record);

// Build key for RocksDB database from `vcf_var`.
let key: Vec<u8> = vcf_var.clone().into();

// Annotate with frequency.
if CHROM_AUTO.contains(vcf_var.chrom.as_str()) {
annotate_record_auto(&db, cf_autosomal, vcf_var.clone(), &mut vcf_record)?;
annotate_record_auto(&db_freq, cf_autosomal, &key, &mut vcf_record)?;
} else if CHROM_XY.contains(vcf_var.chrom.as_str()) {
annotate_record_xy(&db, cf_gonosomal, vcf_var.clone(), &mut vcf_record)?;
annotate_record_xy(&db_freq, cf_gonosomal, &key, &mut vcf_record)?;
} else if CHROM_MT.contains(vcf_var.chrom.as_str()) {
annotate_record_mt(&db, cf_mtdna, vcf_var.clone(), &mut vcf_record)?;
annotate_record_mt(&db_freq, cf_mtdna, &key, &mut vcf_record)?;
} else {
tracing::trace!(
"Record @{:?} on non-canonical chromosome, skipping.",
&vcf_var
);
}

// Annotate with ClinVar.
annotate_record_clinvar(&db_clinvar, cf_clinvar, &key, &mut vcf_record)?;

let VcfVar {
chrom,
pos,
Expand All @@ -638,6 +708,8 @@ fn run_with_writer<Inner: Write>(
}
}

// Annotate with ClinVar annotation.

writer.write_record(&vcf_record)?;
} else {
break; // all done
Expand All @@ -664,6 +736,7 @@ fn run_with_writer<Inner: Write>(

/// Main entry point for `annotate seqvars` sub command.
pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("config = {:#?}", &args);
if args.path_output_vcf.ends_with(".vcf.gz") || args.path_output_vcf.ends_with(".vcf.bgzf") {
let writer = VcfWriter::new(
File::create(&args.path_output_vcf)
Expand Down
17 changes: 8 additions & 9 deletions src/annotate/seqvars/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,15 @@ impl TxIntervalTrees {
for (tx_id, tx) in db.tx_db.transcripts.iter().enumerate() {
for genome_alignment in &tx.genome_alignments {
let contig = &genome_alignment.contig;
let contig_idx = *contig_to_idx
.get(contig)
.unwrap_or_else(|| panic!("Unknown contig {}", contig));
let mut start = std::i32::MAX;
let mut stop = std::i32::MIN;
for exon in &genome_alignment.exons {
start = std::cmp::min(start, exon.alt_start_i);
stop = std::cmp::max(stop, exon.alt_end_i);
if let Some(contig_idx) = contig_to_idx.get(contig) {
let mut start = std::i32::MAX;
let mut stop = std::i32::MIN;
for exon in &genome_alignment.exons {
start = std::cmp::min(start, exon.alt_start_i);
stop = std::cmp::max(stop, exon.alt_end_i);
}
trees[*contig_idx].insert(start..stop, tx_id as u32);
}
trees[contig_idx].insert(start..stop, tx_id as u32);
}

txs += 1;
Expand Down
1 change: 1 addition & 0 deletions src/db/create/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Creation of mehari internal databases.
pub mod seqvar_clinvar;
pub mod seqvar_freqs;
pub mod txs;
Loading

0 comments on commit fd4c523

Please sign in to comment.