Skip to content

Commit

Permalink
feat: adding command "db create seqvars-clinvar" (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Mar 29, 2023
1 parent cefd950 commit c015295
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 4 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ parse-display = "0.8.0"
nom = "7.1.3"
linked-hash-map = "0.5.6"
enumset = { version = "1.0.12", features = ["serde"] }
bincode = "1.3.3"

[build-dependencies]
flatc-rust = "0.2.0"
Expand Down
1 change: 1 addition & 0 deletions src/db/create/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Creation of mehari internal databases.
pub mod seqvar_freqs;
pub mod seqvar_clinvar;
pub mod txs;
138 changes: 138 additions & 0 deletions src/db/create/seqvar_clinvar.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
//! Creation of mehari internal sequence variant ClinVar database.
use std::time::Instant;

use clap::Parser;
use hgvs::static_data::Assembly;

use crate::common::GenomeRelease;

/// Reading `clinvar-tsv` sequence variant files.
pub mod reading {
use serde::{Serialize, Deserialize};

/// Representation of a record from the `clinvar-tsv` output.
///
/// Note that the pathogenicity and review status are available in two fashions. The first is
/// "ClinVar style" and attempts to follow the ClinVar approach. Here, variant assessments
/// with a higher star rating override those a lowe rone. This is what most users want.
/// The assessment "paranoid" uses all assessments, including those without a star rating,
/// on the same level.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Record {
/// Genome release.
release: String,
/// Chromosome name.
chromosome: String,
/// 1-based start position.
start: i32,
/// 1-based end position.
end: i32,
/// Reference allele bases in VCF notation.
reference: String,
/// Alternative allele bases in VCF notation.
alternative: String,
/// VCV accession identifier.
vcv: String,
/// Pathogenicity summary for the variant (ClinVar style).
#[serde(rename = "summary_clinvar_pathogenicity_label")]
summary_clinvar_pathogenicity: String,
/// Pathogenicity review status for the variant (ClinVar style).
summary_clinvar_review_status: String,
/// Pathogenicity gold stars (ClinVar style).
summary_clinvar_gold_stars: i32,
/// Pathogenicity summary for the variant ("paranoid" style).
#[serde(rename = "summary_paranoid_pathogenicity_label")]
summary_paranoid_pathogenicity: String,
/// Pathogenicity review status for the variant ("paranoid" style).
summary_paranoid_review_status: String,
/// Pathogenicity gold stars ("paranoid" style).
summary_paranoid_gold_stars: i32,
}
}

#[derive(Parser, Debug)]
#[command(about = "Construct mehari sequence variant ClinVar database", long_about = None)]
pub struct Args {
/// Genome release to use, default is to auto-detect.
#[arg(long, value_enum)]
pub genome_release: Option<GenomeRelease>,
/// Path to the output database to build.
#[arg(long)]
pub path_output_db: String,

/// Path to the sequence variant ClinVar TSV file from `clinvar-tsv`.
#[arg(long)]
pub path_clinvar_tsv: String,

/// For debug purposes, maximal number of variants to import.
#[arg(long)]
pub max_var_count: Option<usize>,
}

/// Main entry point for `db create seqvar-clinvar` sub command.
pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!(
"Building sequence variant ClinVar database\ncommon args: {:#?}\nargs: {:#?}",
common,
args
);

// Guess genome release from paths.
let genome_release = args.genome_release.map(|gr| match gr {
GenomeRelease::Grch37 => Assembly::Grch37p10, // has chrMT!
GenomeRelease::Grch38 => Assembly::Grch38,
});

// Open the output database, obtain column family handles, and write out meta data.
tracing::info!("Opening output database");
let mut options = rocksdb::Options::default();
options.create_if_missing(true);
options.create_missing_column_families(true);
options.prepare_for_bulk_load();
options.set_disable_auto_compactions(true);
let db = rocksdb::DB::open_cf(
&options,
&args.path_output_db,
["meta", "clinvar_seqvars"],
)?;

let cf_meta = db.cf_handle("meta").unwrap();
let cf_clinvar = db.cf_handle("clinvar_seqvars").unwrap();

tracing::info!("Writing meta data to database");
db.put_cf(cf_meta, "genome-release", format!("{genome_release:?}"))?;

// Import the ClinVar data TSV file.
tracing::info!("Processing ClinVar TSV ...");
todo!();

// Finally, compact manually.
tracing::info!("Enforcing manual compaction");
db.compact_range_cf(cf_meta, None::<&[u8]>, None::<&[u8]>);
db.compact_range_cf(cf_clinvar, None::<&[u8]>, None::<&[u8]>);

let compaction_start = Instant::now();
let mut last_printed = compaction_start;
while db
.property_int_value(rocksdb::properties::COMPACTION_PENDING)?
.unwrap()
> 0
|| db
.property_int_value(rocksdb::properties::NUM_RUNNING_COMPACTIONS)?
.unwrap()
> 0
{
std::thread::sleep(std::time::Duration::from_millis(100));
if last_printed.elapsed() > std::time::Duration::from_millis(1000) {
log::info!(
"... waiting for compaction for {:?}",
compaction_start.elapsed()
);
last_printed = Instant::now();
}
}

tracing::info!("Done building sequence variant ClinVar database");
Ok(())
}
9 changes: 5 additions & 4 deletions src/db/create/seqvar_freqs/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! Population frequencies for sequence variants.
//! Creation of mehari internal sequence variant population frequency database.
pub mod reading;
pub mod serialized;
Expand Down Expand Up @@ -602,10 +602,10 @@ fn import_chrmt(
Ok(())
}

/// Main entry point for `db create seqvar_freqs` sub command.
/// Main entry point for `db create seqvar-freqs` sub command.
pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!(
"Building sequence variant frequencies table\ncommon args: {:#?}\nargs: {:#?}",
"Building sequence variant frequencies database\ncommon args: {:#?}\nargs: {:#?}",
common,
args
);
Expand Down Expand Up @@ -646,6 +646,7 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro

// Finally, compact manually.
tracing::info!("Enforcing manual compaction");
db.compact_range_cf(cf_meta, None::<&[u8]>, None::<&[u8]>);
db.compact_range_cf(cf_autosomal, None::<&[u8]>, None::<&[u8]>);
db.compact_range_cf(cf_gonosomal, None::<&[u8]>, None::<&[u8]>);
db.compact_range_cf(cf_mtdna, None::<&[u8]>, None::<&[u8]>);
Expand All @@ -671,7 +672,7 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro
}
}

tracing::info!("Done building sequence variant frequency table");
tracing::info!("Done building sequence variant frequency database");
Ok(())
}

Expand Down
4 changes: 4 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ struct DbCreate {
enum DbCreateCommands {
Txs(db::create::txs::Args),
SeqvarFreqs(db::create::seqvar_freqs::Args),
SeqvarClinvar(db::create::seqvar_clinvar::Args),
}

/// Parsing of "annotate *" sub commands.
Expand Down Expand Up @@ -212,6 +213,9 @@ fn main() -> Result<(), anyhow::Error> {
DbCreateCommands::SeqvarFreqs(args) => {
db::create::seqvar_freqs::run(&cli.common, args)?
}
DbCreateCommands::SeqvarClinvar(args) => {
db::create::seqvar_clinvar::run(&cli.common, args)?
}
},
},
Commands::Annotate(annotate) => match &annotate.command {
Expand Down

0 comments on commit c015295

Please sign in to comment.