diff --git a/.gitattributes b/.gitattributes index 23af1223..06387b78 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ tests/data/db/create/txs/latest/** filter=lfs diff=lfs merge=lfs -text -tests/data/vars/*.vcf* -tests/data/vars/*.tsv* +tests/data/vars/*.vcf* filter=lfs diff=lfs merge=lfs -text +tests/data/vars/*.tsv* filter=lfs diff=lfs merge=lfs -text +tests/data/** filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text diff --git a/Cargo.toml b/Cargo.toml index 9b31178b..b42f0d76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ parse-display = "0.8.0" nom = "7.1.3" linked-hash-map = "0.5.6" enumset = { version = "1.0.12", features = ["serde"] } +bincode = "1.3.3" +bgzip = "0.3.1" [build-dependencies] flatc-rust = "0.2.0" diff --git a/docs/db_build.md b/docs/db_build.md index c4f13c5c..ba6dbbb3 100644 --- a/docs/db_build.md +++ b/docs/db_build.md @@ -39,7 +39,7 @@ $ bcftools \ ``` The reduction of this is quite big. -For GRCh37, the gnomAD raw download files go down from 449GB to 2.8GB and for GRCh38 from 2.2TB to XXXGB. +For GRCh37, the gnomAD raw download files go down from 449GB to 2.8GB and for GRCh38 from 2.2TB to 4.2GB. ## Building Frequency Databases @@ -157,3 +157,15 @@ $ mehari db create txs \ You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`. For GRCh38, simply use `--genome-release grch38`. + +## Building ClinVar Database + +This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv). + +``` +$ mehari db create seqvar-clinvar \ + --path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \ + --path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz +``` + +You can specify an optional `--genome-release grch37` argument that will be used to check the ClinVar database to be compatible with your data. diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 6ef3a963..c2be7dc8 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -107,10 +107,15 @@ impl ConsequencePredictor { txs.sort_by(|a, b| a.tx_ac.cmp(&b.tx_ac)); // Generate `AnnField` records for each transcript. + // + // Skip `None` results. Ok(Some( txs.into_iter() .map(|tx| self.build_ann_field(&var, tx, chrom_acc.clone(), var_start, var_end)) - .collect::, _>>()?, + .collect::, _>>()? + .into_iter() + .filter_map(|x| x) + .collect::>(), )) } @@ -121,11 +126,18 @@ impl ConsequencePredictor { chrom_acc: String, var_start: i32, var_end: i32, - ) -> Result { + ) -> Result, anyhow::Error> { // NB: The coordinates of var_start, var_end, as well as the exon boundaries // are 0-based. let tx = self.provider.get_tx(&tx_record.tx_ac).unwrap(); + + // Skip transcripts that are protein coding but do not have a CDS. + // TODO: do not include such transcripts when building the database. + if tx.biotype == TranscriptBiotype::Coding && tx.start_codon.is_none() { + return Ok(None); + } + let mut consequences: Vec = Vec::new(); let alignment = tx.genome_alignments.first().unwrap(); @@ -366,7 +378,26 @@ impl ConsequencePredictor { let prot_len = cds_len / 3; let var_c = self.mapper.n_to_c(&var_n)?; - let var_p = self.mapper.c_to_p(&var_c)?; + // Gracefully handle the case that the transcript is unsupported because the length + // is not a multiple of 3. + // TODO: do not include such transcripts when building the tx database. + let var_p = self.mapper.c_to_p(&var_c).map_or_else( + |e| { + if e.to_string() + .contains("is not supported because its sequence length of") + { + Ok(None) + } else { + Err(e) + } + }, + |v| Ok(Some(v)), + )?; + if var_p.is_none() { + return Ok(None); + } + let var_p = var_p.unwrap(); + let hgvs_p = Some(format!("{}", &var_p)); let cds_pos = match &var_c { HgvsVariant::CdsVariant { loc_edit, .. } => Some(Pos { @@ -547,7 +578,7 @@ impl ConsequencePredictor { let putative_impact = (*consequences.first().unwrap()).into(); // Build and return ANN field from the information derived above. - Ok(AnnField { + Ok(Some(AnnField { allele: Allele::Alt { alternative: var.alternative.clone(), }, @@ -568,7 +599,7 @@ impl ConsequencePredictor { protein_pos, distance, messages: None, - }) + })) } // Normalize variant by stripping common suffixes and prefixes. diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index ca2d3506..f43b403d 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -29,6 +29,7 @@ use thousands::Separable; use crate::annotate::seqvars::csq::{ConsequencePredictor, VcfVariant}; use crate::annotate::seqvars::provider::MehariProvider; use crate::common::GenomeRelease; +use crate::db::create::seqvar_clinvar::serialize::Record as ClinvarRecord; use crate::db::create::seqvar_freqs::reading::guess_assembly; use crate::db::create::seqvar_freqs::serialized::vcf::Var as VcfVar; use crate::db::create::seqvar_freqs::serialized::{ @@ -98,6 +99,9 @@ pub mod keys { pub static ref HELIX_HET: InfoKey = InfoKey::Other(InfoKeyOther::from_str("helix_het").unwrap()); pub static ref ANN: InfoKey = InfoKey::Other(InfoKeyOther::from_str("ANN").unwrap()); + + pub static ref CLINVAR_PATHO: InfoKey = InfoKey::Other(InfoKeyOther::from_str("clinvar_patho").unwrap()); + pub static ref CLINVAR_VCV: InfoKey = InfoKey::Other(InfoKeyOther::from_str("clinvar_vcv").unwrap()); } } @@ -207,6 +211,15 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { ), ); + header_out.infos_mut().insert( + keys::CLINVAR_PATHO.clone(), + Map::::new(Number::Count(1), Type::String, "ClinVar pathogenicity"), + ); + header_out.infos_mut().insert( + keys::CLINVAR_VCV.clone(), + Map::::new(Number::Count(1), Type::String, "ClinVar VCV accession"), + ); + header_out } @@ -214,13 +227,12 @@ fn build_header(header_in: &VcfHeader) -> VcfHeader { fn annotate_record_auto( db: &rocksdb::DBWithThreadMode, cf: &rocksdb::ColumnFamily, - vcf_var: VcfVar, + key: &Vec, vcf_record: &mut noodles::vcf::Record, ) -> Result<(), anyhow::Error> where T: ThreadMode, { - let key: Vec = vcf_var.into(); if let Some(freq) = db.get_cf(cf, key)? { let auto_record = AutoRecord::from_buf(&freq); @@ -257,13 +269,12 @@ where fn annotate_record_xy( db: &rocksdb::DBWithThreadMode, cf: &rocksdb::ColumnFamily, - vcf_var: VcfVar, + key: &Vec, vcf_record: &mut noodles::vcf::Record, ) -> Result<(), anyhow::Error> where T: ThreadMode, { - let key: Vec = vcf_var.into(); if let Some(freq) = db.get_cf(cf, key)? { let auto_record = XyRecord::from_buf(&freq); @@ -308,13 +319,12 @@ where fn annotate_record_mt( db: &rocksdb::DBWithThreadMode, cf: &rocksdb::ColumnFamily, - vcf_var: VcfVar, + key: &Vec, vcf_record: &mut noodles::vcf::Record, ) -> Result<(), anyhow::Error> where T: ThreadMode, { - let key: Vec = vcf_var.into(); if let Some(freq) = db.get_cf(cf, key)? { let mt_record = MtRecord::from_buf(&freq); @@ -347,6 +357,39 @@ where Ok(()) } +/// Annotate record with ClinVar information. +fn annotate_record_clinvar( + db: &rocksdb::DBWithThreadMode, + cf: &rocksdb::ColumnFamily, + key: &Vec, + vcf_record: &mut noodles::vcf::Record, +) -> Result<(), anyhow::Error> +where + T: ThreadMode, +{ + if let Some(clinvar_anno) = db.get_cf(cf, key)? { + let clinvar_record: ClinvarRecord = bincode::deserialize(&clinvar_anno)?; + + let ClinvarRecord { + summary_clinvar_pathogenicity, + vcv, + .. + } = clinvar_record; + + vcf_record.info_mut().insert( + keys::CLINVAR_PATHO.clone(), + Some(Value::String( + summary_clinvar_pathogenicity.first().unwrap().to_string(), + )), + ); + vcf_record + .info_mut() + .insert(keys::CLINVAR_VCV.clone(), Some(Value::String(vcv.clone()))); + } + + Ok(()) +} + lazy_static::lazy_static! { static ref CHROM_MT: HashSet<&'static str> = HashSet::from_iter(["M", "MT", "chrM", "chrMT"].into_iter()); static ref CHROM_XY: HashSet<&'static str> = HashSet::from_iter(["M", "MT", "chrM", "chrMT"].into_iter()); @@ -554,23 +597,43 @@ fn run_with_writer( let assembly = guess_assembly(&header_in, false, genome_release)?; tracing::info!("Determined input assembly to be {:?}", &assembly); - // Open the RocksDB database in read only mode. + // Open the frequency RocksDB database in read only mode. tracing::info!("Opening frequency database"); + let rocksdb_path = format!( + "{}/seqvars/{}/freqs", + &args.path_db, + path_component(assembly) + ); + tracing::debug!("RocksDB path = {}", &rocksdb_path); let options = rocksdb::Options::default(); - let db = rocksdb::DB::open_cf_for_read_only( + let db_freq = rocksdb::DB::open_cf_for_read_only( &options, - &format!( - "{}/seqvars/{}/freqs", - &args.path_db, - path_component(assembly) - ), + &rocksdb_path, ["meta", "autosomal", "gonosomal", "mitochondrial"], false, )?; - let cf_autosomal = db.cf_handle("autosomal").unwrap(); - let cf_gonosomal = db.cf_handle("gonosomal").unwrap(); - let cf_mtdna = db.cf_handle("mitochondrial").unwrap(); + let cf_autosomal = db_freq.cf_handle("autosomal").unwrap(); + let cf_gonosomal = db_freq.cf_handle("gonosomal").unwrap(); + let cf_mtdna = db_freq.cf_handle("mitochondrial").unwrap(); + + // Open the ClinVar RocksDB database in read only mode. + tracing::info!("Opening ClinVar database"); + let rocksdb_path = format!( + "{}/seqvars/{}/clinvar", + &args.path_db, + path_component(assembly) + ); + tracing::debug!("RocksDB path = {}", &rocksdb_path); + let options = rocksdb::Options::default(); + let db_clinvar = rocksdb::DB::open_cf_for_read_only( + &options, + &rocksdb_path, + ["meta", "clinvar_seqvars"], + false, + )?; + + let cf_clinvar = db_clinvar.cf_handle("clinvar_seqvars").unwrap(); // Open the transcript flatbuffer. tracing::info!("Opening transcript database"); @@ -597,16 +660,20 @@ fn run_with_writer( loop { if let Some(record) = records.next() { let mut vcf_record = record?; + // TODO: ignores all but the first alternative allele! let vcf_var = VcfVar::from_vcf(&vcf_record); + // Build key for RocksDB database from `vcf_var`. + let key: Vec = vcf_var.clone().into(); + // Annotate with frequency. if CHROM_AUTO.contains(vcf_var.chrom.as_str()) { - annotate_record_auto(&db, cf_autosomal, vcf_var.clone(), &mut vcf_record)?; + annotate_record_auto(&db_freq, cf_autosomal, &key, &mut vcf_record)?; } else if CHROM_XY.contains(vcf_var.chrom.as_str()) { - annotate_record_xy(&db, cf_gonosomal, vcf_var.clone(), &mut vcf_record)?; + annotate_record_xy(&db_freq, cf_gonosomal, &key, &mut vcf_record)?; } else if CHROM_MT.contains(vcf_var.chrom.as_str()) { - annotate_record_mt(&db, cf_mtdna, vcf_var.clone(), &mut vcf_record)?; + annotate_record_mt(&db_freq, cf_mtdna, &key, &mut vcf_record)?; } else { tracing::trace!( "Record @{:?} on non-canonical chromosome, skipping.", @@ -614,6 +681,9 @@ fn run_with_writer( ); } + // Annotate with ClinVar. + annotate_record_clinvar(&db_clinvar, cf_clinvar, &key, &mut vcf_record)?; + let VcfVar { chrom, pos, @@ -638,6 +708,8 @@ fn run_with_writer( } } + // Annotate with ClinVar annotation. + writer.write_record(&vcf_record)?; } else { break; // all done @@ -664,6 +736,7 @@ fn run_with_writer( /// Main entry point for `annotate seqvars` sub command. pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + tracing::info!("config = {:#?}", &args); if args.path_output_vcf.ends_with(".vcf.gz") || args.path_output_vcf.ends_with(".vcf.bgzf") { let writer = VcfWriter::new( File::create(&args.path_output_vcf) diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index 262bc315..6cd1674e 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -66,16 +66,15 @@ impl TxIntervalTrees { for (tx_id, tx) in db.tx_db.transcripts.iter().enumerate() { for genome_alignment in &tx.genome_alignments { let contig = &genome_alignment.contig; - let contig_idx = *contig_to_idx - .get(contig) - .unwrap_or_else(|| panic!("Unknown contig {}", contig)); - let mut start = std::i32::MAX; - let mut stop = std::i32::MIN; - for exon in &genome_alignment.exons { - start = std::cmp::min(start, exon.alt_start_i); - stop = std::cmp::max(stop, exon.alt_end_i); + if let Some(contig_idx) = contig_to_idx.get(contig) { + let mut start = std::i32::MAX; + let mut stop = std::i32::MIN; + for exon in &genome_alignment.exons { + start = std::cmp::min(start, exon.alt_start_i); + stop = std::cmp::max(stop, exon.alt_end_i); + } + trees[*contig_idx].insert(start..stop, tx_id as u32); } - trees[contig_idx].insert(start..stop, tx_id as u32); } txs += 1; diff --git a/src/db/create/mod.rs b/src/db/create/mod.rs index cbd5b448..981dd091 100644 --- a/src/db/create/mod.rs +++ b/src/db/create/mod.rs @@ -1,4 +1,5 @@ //! Creation of mehari internal databases. +pub mod seqvar_clinvar; pub mod seqvar_freqs; pub mod txs; diff --git a/src/db/create/seqvar_clinvar.rs b/src/db/create/seqvar_clinvar.rs new file mode 100644 index 00000000..481b99f9 --- /dev/null +++ b/src/db/create/seqvar_clinvar.rs @@ -0,0 +1,399 @@ +//! Creation of mehari internal sequence variant ClinVar database. + +use std::{fmt::Display, str::FromStr}; +use std::{ + io::{BufRead, BufReader}, + time::Instant, +}; + +use bgzip::BGZFReader; +use clap::Parser; +use hgvs::static_data::Assembly; +use rocksdb::{DBWithThreadMode, SingleThreaded}; +use serde::{Deserialize, Serialize}; +use thousands::Separable; + +use crate::common::GenomeRelease; + +use super::seqvar_freqs::serialized::vcf::Var; + +/// Enumeration for ClinVar pathogenicity. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum Pathogenicity { + /// Pathogenic. + Pathogenic, + /// Likely pathogenic. + LikelyPathogenic, + /// Uncertain significance. + UncertainSignificance, + /// Likely benign. + LikelyBenign, + /// Benign. + Benign, +} + +impl Display for Pathogenicity { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Pathogenicity::Pathogenic => write!(f, "pathogenic"), + Pathogenicity::LikelyPathogenic => write!(f, "likely pathogenic"), + Pathogenicity::UncertainSignificance => write!(f, "uncertain significance"), + Pathogenicity::LikelyBenign => write!(f, "likely benign"), + Pathogenicity::Benign => write!(f, "benign"), + } + } +} + +impl FromStr for Pathogenicity { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "pathogenic" => Ok(Pathogenicity::Pathogenic), + "likely pathogenic" => Ok(Pathogenicity::LikelyPathogenic), + "uncertain significance" => Ok(Pathogenicity::UncertainSignificance), + "likely benign" => Ok(Pathogenicity::LikelyBenign), + "benign" => Ok(Pathogenicity::Benign), + _ => anyhow::bail!("Unknown pathogenicity: {}", s), + } + } +} + +impl Serialize for Pathogenicity { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for Pathogenicity { + fn deserialize(d: D) -> Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(d)?; + Self::from_str(&s).map_err(serde::de::Error::custom) + } +} + +/// Reading `clinvar-tsv` sequence variant files. +pub mod reading { + use serde::{Deserialize, Serialize}; + + use super::Pathogenicity; + + /// Representation of a record from the `clinvar-tsv` output. + /// + /// Note that the pathogenicity and review status are available in two fashions. The first is + /// "ClinVar style" and attempts to follow the ClinVar approach. Here, variant assessments + /// with a higher star rating override those a lowe rone. This is what most users want. + /// The assessment "paranoid" uses all assessments, including those without a star rating, + /// on the same level. + #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] + pub struct Record { + /// Genome release. + pub release: String, + /// Chromosome name. + pub chromosome: String, + /// 1-based start position. + pub start: u32, + /// 1-based end position. + pub end: u32, + /// Reference allele bases in VCF notation. + pub reference: String, + /// Alternative allele bases in VCF notation. + pub alternative: String, + /// VCV accession identifier. + pub vcv: String, + /// Pathogenicity summary for the variant (ClinVar style). + #[serde(deserialize_with = "deserialize_pathogenicity")] + pub summary_clinvar_pathogenicity: Vec, + /// Pathogenicity gold stars (ClinVar style). + pub summary_clinvar_gold_stars: u32, + /// Pathogenicity summary for the variant ("paranoid" style). + #[serde(deserialize_with = "deserialize_pathogenicity")] + pub summary_paranoid_pathogenicity: Vec, + /// Pathogenicity gold stars ("paranoid" style). + pub summary_paranoid_gold_stars: u32, + } + + fn deserialize_pathogenicity<'de, D>(deserializer: D) -> Result, D::Error> + where + D: serde::Deserializer<'de>, + { + let s: &str = Deserialize::deserialize(deserializer)?; + let s = s.replace('{', "[").replace('}', "]"); + serde_json::from_str(&s).map_err(serde::de::Error::custom) + } +} + +pub mod serialize { + use serde::{Deserialize, Serialize}; + + use super::Pathogenicity; + + #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] + pub struct Record { + /// Genome release. + pub release: String, + /// Chromosome name. + pub chromosome: String, + /// 1-based start position. + pub start: u32, + /// 1-based end position. + pub end: u32, + /// Reference allele bases in VCF notation. + pub reference: String, + /// Alternative allele bases in VCF notation. + pub alternative: String, + /// VCV accession identifier. + pub vcv: String, + /// Pathogenicity summary for the variant (ClinVar style). + pub summary_clinvar_pathogenicity: Vec, + /// Pathogenicity gold stars (ClinVar style). + pub summary_clinvar_gold_stars: u32, + /// Pathogenicity summary for the variant ("paranoid" style). + pub summary_paranoid_pathogenicity: Vec, + /// Pathogenicity gold stars ("paranoid" style). + pub summary_paranoid_gold_stars: u32, + } + + impl From for Record { + fn from(r: super::reading::Record) -> Self { + Self { + release: r.release, + chromosome: r.chromosome, + start: r.start, + end: r.end, + reference: r.reference, + alternative: r.alternative, + vcv: r.vcv, + summary_clinvar_pathogenicity: r.summary_clinvar_pathogenicity, + summary_clinvar_gold_stars: r.summary_clinvar_gold_stars, + summary_paranoid_pathogenicity: r.summary_paranoid_pathogenicity, + summary_paranoid_gold_stars: r.summary_paranoid_gold_stars, + } + } + } +} + +#[derive(Parser, Debug)] +#[command(about = "Construct mehari sequence variant ClinVar database", long_about = None)] +pub struct Args { + /// Genome release to use, default is to auto-detect. + #[arg(long, value_enum)] + pub genome_release: Option, + /// Path to the output database to build. + #[arg(long)] + pub path_output_db: String, + + /// Path to the sequence variant ClinVar TSV file from `clinvar-tsv`. + #[arg(long)] + pub path_clinvar_tsv: String, + + /// For debug purposes, maximal number of variants to import. + #[arg(long)] + pub max_var_count: Option, +} + +/// Convert release string to Assembly. +fn str_to_assembly(s: &str) -> Result { + match s { + "GRCh37" => Ok(Assembly::Grch37p10), + "GRCh38" => Ok(Assembly::Grch38), + _ => anyhow::bail!("Unknown genome release: {}", s), + } +} + +/// Guess genome release from TSV file. +fn guess_genome_release(reader: Box) -> Result { + let reader = std::io::BufReader::new(reader); + let mut reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_reader(reader); + let record: reading::Record = reader.deserialize().next().unwrap()?; + str_to_assembly(record.release.as_str()) +} + +fn open_tsv(args: &Args) -> Result, anyhow::Error> { + Ok(if args.path_clinvar_tsv.ends_with(".gz") { + Box::new(BGZFReader::new(std::fs::File::open( + &args.path_clinvar_tsv, + )?)?) + } else { + Box::new(BufReader::new(std::fs::File::open(&args.path_clinvar_tsv)?)) + }) +} + +/// Import ClinVar TSV file into RocksDB column family. +fn import_clinvar_seqvars( + args: &Args, + genome_release: Option, + db: &DBWithThreadMode, + cf_clinvar: &rocksdb::ColumnFamily, +) -> Result<(), anyhow::Error> { + let start = Instant::now(); + + // Read first record from TSV file to guess genome release. + let reader = open_tsv(args)?; + let guessed_release = guess_genome_release(reader)?; + // Check genome release guessed vs the one passed as argument. + let genome_release = if let Some(genome_release) = genome_release { + if genome_release != guessed_release { + anyhow::bail!( + "Genome release mismatch: {:?} vs. {:?}", + genome_release, + guessed_release + ); + } else { + genome_release + } + } else { + guessed_release + }; + + // Open TSV file again and read all records, writing them to db/cf_clinvar. + let mut reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_reader(open_tsv(args)?); + let mut prev = Instant::now(); + let mut records_written = 0; + for record in reader.deserialize() { + let mut record: reading::Record = record?; + + record.summary_clinvar_pathogenicity.sort(); + record.summary_paranoid_pathogenicity.sort(); + + let var = Var { + chrom: record.chromosome.clone(), + pos: record.start, + reference: record.reference.clone(), + alternative: record.alternative.clone(), + }; + tracing::trace!("now at {:?}", &var); + if var.pos == 865567 { + tracing::info!("at {:?}", &var); + } + + if prev.elapsed().as_secs() >= 60 { + tracing::info!("at {:?}", &var); + prev = Instant::now(); + } + + // Validate record's genome release. + let record_release = str_to_assembly(record.release.as_str())?; + if record_release != genome_release { + anyhow::bail!( + "Genome release mismatch: {:?} vs. {:?}", + genome_release, + record_release + ); + } + + // Serialize record to `Vec` using `bincode`. + let record: serialize::Record = record.into(); + let value = bincode::serialize(&record)?; + // Derive key from record. + let key: Vec = var.into(); + + db.put_cf(cf_clinvar, key, value)?; + + records_written += 1; + + if let Some(max_var_count) = args.max_var_count { + if records_written >= max_var_count { + tracing::warn!("Stopping after {} records as requested", max_var_count); + break; + } + } + } + + tracing::info!( + " wrote {} ClinVar seqvars records in {:?}", + records_written.separate_with_commas(), + start.elapsed() + ); + + Ok(()) +} + +fn rocksdb_tuning(options: rocksdb::Options) -> rocksdb::Options { + let mut options = options; + + // compress all files with Zstandard + options.set_compression_per_level(&[]); + options.set_compression_type(rocksdb::DBCompressionType::Zstd); + // We only want to set level to 2 but have to set the rest as well using the Rust interface. + // The (default) values for the other levels were taken from the output of a RocksDB + // output folder created with default settings. + options.set_compression_options(-14, 2, 0, 0); + + options +} + +/// Main entry point for `db create seqvar-clinvar` sub command. +pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + tracing::info!( + "Building sequence variant ClinVar database\ncommon args: {:#?}\nargs: {:#?}", + common, + args + ); + + // Guess genome release from paths. + let genome_release = args.genome_release.map(|gr| match gr { + GenomeRelease::Grch37 => Assembly::Grch37p10, // has chrMT! + GenomeRelease::Grch38 => Assembly::Grch38, + }); + + // Open the output database, obtain column family handles, and write out meta data. + tracing::info!("Opening output database"); + let mut options = rocksdb_tuning(rocksdb::Options::default()); + options.create_if_missing(true); + options.create_missing_column_families(true); + options.prepare_for_bulk_load(); + options.set_disable_auto_compactions(true); + let db = rocksdb::DB::open_cf(&options, &args.path_output_db, ["meta", "clinvar_seqvars"])?; + + let cf_meta = db.cf_handle("meta").unwrap(); + let cf_clinvar = db.cf_handle("clinvar_seqvars").unwrap(); + + tracing::info!("Writing meta data to database"); + db.put_cf(cf_meta, "genome-release", format!("{genome_release:?}"))?; + + // Import the ClinVar data TSV file. + tracing::info!("Processing ClinVar TSV ..."); + import_clinvar_seqvars(args, genome_release, &db, cf_clinvar)?; + + // Finally, compact manually. + tracing::info!("Enforcing manual compaction"); + db.compact_range_cf(cf_meta, None::<&[u8]>, None::<&[u8]>); + db.compact_range_cf(cf_clinvar, None::<&[u8]>, None::<&[u8]>); + + let compaction_start = Instant::now(); + let mut last_printed = compaction_start; + while db + .property_int_value(rocksdb::properties::COMPACTION_PENDING)? + .unwrap() + > 0 + || db + .property_int_value(rocksdb::properties::NUM_RUNNING_COMPACTIONS)? + .unwrap() + > 0 + { + std::thread::sleep(std::time::Duration::from_millis(100)); + if last_printed.elapsed() > std::time::Duration::from_millis(1000) { + log::info!( + "... waiting for compaction for {:?}", + compaction_start.elapsed() + ); + last_printed = Instant::now(); + } + } + + tracing::info!("Done building sequence variant ClinVar database"); + Ok(()) +} diff --git a/src/db/create/seqvar_freqs/mod.rs b/src/db/create/seqvar_freqs/mod.rs index 9658664b..7d5a441f 100644 --- a/src/db/create/seqvar_freqs/mod.rs +++ b/src/db/create/seqvar_freqs/mod.rs @@ -1,4 +1,4 @@ -//! Population frequencies for sequence variants. +//! Creation of mehari internal sequence variant population frequency database. pub mod reading; pub mod serialized; @@ -9,6 +9,7 @@ use clap::Parser; use hgvs::static_data::Assembly; use rocksdb::{DBWithThreadMode, SingleThreaded}; +use thousands::Separable; use crate::common::GenomeRelease; @@ -469,7 +470,7 @@ fn import_autosomal( let mut has_next = true; while has_next { has_next = auto_reader.run(|variant, gnomad_exomes, gnomad_genomes| { - if prev.elapsed().as_secs() > 60 { + if prev.elapsed().as_secs() >= 60 { tracing::info!("at {:?}", &variant); prev = Instant::now(); } @@ -491,7 +492,7 @@ fn import_autosomal( tracing::info!( " wrote {} chr1, ..., chr22 records in {:?}", - chrauto_written, + chrauto_written.separate_with_commas(), start.elapsed() ); Ok(()) @@ -547,7 +548,7 @@ fn import_gonomosomal( tracing::info!( " wrote {} chrX and chrY records in {:?}", - chrxy_written, + chrxy_written.separate_with_commas(), start.elapsed() ); Ok(()) @@ -596,16 +597,16 @@ fn import_chrmt( tracing::info!( " wrote {} chrMT records in {:?}", - chrmt_written, + chrmt_written.separate_with_commas(), start.elapsed() ); Ok(()) } -/// Main entry point for `db create seqvar_freqs` sub command. +/// Main entry point for `db create seqvar-freqs` sub command. pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { tracing::info!( - "Building sequence variant frequencies table\ncommon args: {:#?}\nargs: {:#?}", + "Building sequence variant frequencies database\ncommon args: {:#?}\nargs: {:#?}", common, args ); @@ -646,6 +647,7 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro // Finally, compact manually. tracing::info!("Enforcing manual compaction"); + db.compact_range_cf(cf_meta, None::<&[u8]>, None::<&[u8]>); db.compact_range_cf(cf_autosomal, None::<&[u8]>, None::<&[u8]>); db.compact_range_cf(cf_gonosomal, None::<&[u8]>, None::<&[u8]>); db.compact_range_cf(cf_mtdna, None::<&[u8]>, None::<&[u8]>); @@ -671,7 +673,7 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro } } - tracing::info!("Done building sequence variant frequency table"); + tracing::info!("Done building sequence variant frequency database"); Ok(()) } diff --git a/src/db/create/txs/data.rs b/src/db/create/txs/data.rs index 065f803e..e1b524cb 100644 --- a/src/db/create/txs/data.rs +++ b/src/db/create/txs/data.rs @@ -7,7 +7,7 @@ use enumset::{EnumSet, EnumSetType}; use serde::{Deserialize, Serialize}; // Enumeration for `Transcript::biotype`. -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] pub enum TranscriptBiotype { Coding, NonCoding, diff --git a/src/main.rs b/src/main.rs index 268157de..da3b83f1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -165,6 +165,7 @@ struct DbCreate { enum DbCreateCommands { Txs(db::create::txs::Args), SeqvarFreqs(db::create::seqvar_freqs::Args), + SeqvarClinvar(db::create::seqvar_clinvar::Args), } /// Parsing of "annotate *" sub commands. @@ -212,6 +213,9 @@ fn main() -> Result<(), anyhow::Error> { DbCreateCommands::SeqvarFreqs(args) => { db::create::seqvar_freqs::run(&cli.common, args)? } + DbCreateCommands::SeqvarClinvar(args) => { + db::create::seqvar_clinvar::run(&cli.common, args)? + } }, }, Commands::Annotate(annotate) => match &annotate.command { @@ -219,6 +223,7 @@ fn main() -> Result<(), anyhow::Error> { }, } + tracing::info!("... the dromedary is back in the stable."); tracing::info!("All done. Have a nice day!"); Ok::<(), anyhow::Error>(()) diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000012.log b/tests/data/annotate/db/seqvars/grch37/clinvar/000012.log new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000013.sst b/tests/data/annotate/db/seqvars/grch37/clinvar/000013.sst new file mode 100644 index 00000000..e6400a22 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/000013.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab998c80f4b44c55a05e2caac1569c1db9caed07d6b6fa6d2b01fe4e976dfbdd +size 1037 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000014.sst b/tests/data/annotate/db/seqvars/grch37/clinvar/000014.sst new file mode 100644 index 00000000..d05e535e --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/000014.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb9d5cca414be6fafc14746a9ee954bebdaaf965cf6d4e7f1ef04ec8f5e2023 +size 5251 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/CURRENT b/tests/data/annotate/db/seqvars/grch37/clinvar/CURRENT new file mode 100644 index 00000000..f8d50486 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd +size 16 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/IDENTITY b/tests/data/annotate/db/seqvars/grch37/clinvar/IDENTITY new file mode 100644 index 00000000..b9cba323 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/IDENTITY @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa20560c6be2722b2c0cfe2b0a18beb58b1df4d50c16aefa38f368aadd9ef48d +size 36 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/LOCK b/tests/data/annotate/db/seqvars/grch37/clinvar/LOCK new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/LOG b/tests/data/annotate/db/seqvars/grch37/clinvar/LOG new file mode 100644 index 00000000..5e673761 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/LOG @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe51512c988c1c1e729c9c92b259d6b604ee0c767286c29c3e6408201bbcb2e +size 55685 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/MANIFEST-000005 b/tests/data/annotate/db/seqvars/grch37/clinvar/MANIFEST-000005 new file mode 100644 index 00000000..3abd741e --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/MANIFEST-000005 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3376d4ccec695035c008a1fe2ab6dfadf0eed778fe7c7cff446c199633041130 +size 644 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000009 b/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000009 new file mode 100644 index 00000000..345354c1 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000009 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b813453ad8fde4d3fbfe489631d60405e77b19b65107e39fa4ff10ac47315a93 +size 15351 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000011 b/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000011 new file mode 100644 index 00000000..345354c1 --- /dev/null +++ b/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000011 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b813453ad8fde4d3fbfe489631d60405e77b19b65107e39fa4ff10ac47315a93 +size 15351 diff --git a/tests/data/db/create/seqvar_freqs/db-rs1263393206/output.vcf b/tests/data/db/create/seqvar_freqs/db-rs1263393206/output.vcf index 15826f5f..98e9f866 100644 --- a/tests/data/db/create/seqvar_freqs/db-rs1263393206/output.vcf +++ b/tests/data/db/create/seqvar_freqs/db-rs1263393206/output.vcf @@ -1,135 +1,3 @@ -##fileformat=VCFv4.2 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##GATKCommandLine.HaplotypeCaller= -##reference=file:///fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa -##bcftools_viewVersion=1.9+htslib-1.9 -##bcftools_viewCommand=view --header-only /fast/users/holtgrem_c/scratch/tmp/tmp.San1J5p2CT/tmp.vcf.gz; Date=Wed Sep 18 14:09:14 2019 -##SAMPLE= -##PEDIGREE= -##bcftools_concatVersion=1.9+htslib-1.9 -##bcftools_concatCommand=concat --allow-overlaps -d none -o output/out.vcf.gz -O z job_out.0.d/out/tmp_0.vcf.gz job_out.1.d/out/tmp_1.vcf.gz job_out.2.d/out/tmp_2.vcf.gz job_out.3.d/out/tmp_3.vcf.gz job_out.4.d/out/tmp_4.vcf.gz job_out.5.d/out/tmp_5.vcf.gz job_out.6.d/out/tmp_6.vcf.gz job_out.7.d/out/tmp_7.vcf.gz job_out.8.d/out/tmp_8.vcf.gz job_out.9.d/out/tmp_9.vcf.gz job_out.10.d/out/tmp_10.vcf.gz job_out.11.d/out/tmp_11.vcf.gz job_out.12.d/out/tmp_12.vcf.gz job_out.13.d/out/tmp_13.vcf.gz job_out.14.d/out/tmp_14.vcf.gz job_out.15.d/out/tmp_15.vcf.gz job_out.16.d/out/tmp_16.vcf.gz job_out.17.d/out/tmp_17.vcf.gz job_out.18.d/out/tmp_18.vcf.gz job_out.19.d/out/tmp_19.vcf.gz job_out.20.d/out/tmp_20.vcf.gz job_out.21.d/out/tmp_21.vcf.gz job_out.22.d/out/tmp_22.vcf.gz job_out.23.d/out/tmp_23.vcf.gz job_out.24.d/out/tmp_24.vcf.gz job_out.25.d/out/tmp_25.vcf.gz job_out.26.d/out/tmp_26.vcf.gz job_out.27.d/out/tmp_27.vcf.gz job_out.28.d/out/tmp_28.vcf.gz job_out.29.d/out/tmp_29.vcf.gz job_out.30.d/out/tmp_30.vcf.gz job_out.31.d/out/tmp_31.vcf.gz job_out.32.d/out/tmp_32.vcf.gz job_out.33.d/out/tmp_33.vcf.gz job_out.34.d/out/tmp_34.vcf.gz job_out.35.d/out/tmp_35.vcf.gz job_out.36.d/out/tmp_36.vcf.gz job_out.37.d/out/tmp_37.vcf.gz job_out.38.d/out/tmp_38.vcf.gz job_out.39.d/out/tmp_39.vcf.gz job_out.40.d/out/tmp_40.vcf.gz job_out.41.d/out/tmp_41.vcf.gz job_out.42.d/out/tmp_42.vcf.gz job_out.43.d/out/tmp_43.vcf.gz job_out.44.d/out/tmp_44.vcf.gz job_out.45.d/out/tmp_45.vcf.gz job_out.46.d/out/tmp_46.vcf.gz job_out.47.d/out/tmp_47.vcf.gz job_out.48.d/out/tmp_48.vcf.gz job_out.49.d/out/tmp_49.vcf.gz job_out.50.d/out/tmp_50.vcf.gz job_out.51.d/out/tmp_51.vcf.gz job_out.52.d/out/tmp_52.vcf.gz job_out.53.d/out/tmp_53.vcf.gz job_out.54.d/out/tmp_54.vcf.gz job_out.55.d/out/tmp_55.vcf.gz job_out.56.d/out/tmp_56.vcf.gz job_out.57.d/out/tmp_57.vcf.gz job_out.58.d/out/tmp_58.vcf.gz job_out.59.d/out/tmp_59.vcf.gz job_out.60.d/out/tmp_60.vcf.gz job_out.61.d/out/tmp_61.vcf.gz job_out.62.d/out/tmp_62.vcf.gz job_out.63.d/out/tmp_63.vcf.gz job_out.64.d/out/tmp_64.vcf.gz job_out.65.d/out/tmp_65.vcf.gz job_out.66.d/out/tmp_66.vcf.gz job_out.67.d/out/tmp_67.vcf.gz job_out.68.d/out/tmp_68.vcf.gz job_out.69.d/out/tmp_69.vcf.gz job_out.70.d/out/tmp_70.vcf.gz job_out.71.d/out/tmp_71.vcf.gz job_out.72.d/out/tmp_72.vcf.gz job_out.73.d/out/tmp_73.vcf.gz job_out.74.d/out/tmp_74.vcf.gz job_out.75.d/out/tmp_75.vcf.gz job_out.76.d/out/tmp_76.vcf.gz job_out.77.d/out/tmp_77.vcf.gz job_out.78.d/out/tmp_78.vcf.gz job_out.79.d/out/tmp_79.vcf.gz job_out.80.d/out/tmp_80.vcf.gz job_out.81.d/out/tmp_81.vcf.gz job_out.82.d/out/tmp_82.vcf.gz job_out.83.d/out/tmp_83.vcf.gz job_out.84.d/out/tmp_84.vcf.gz job_out.85.d/out/tmp_85.vcf.gz job_out.86.d/out/tmp_86.vcf.gz job_out.87.d/out/tmp_87.vcf.gz job_out.88.d/out/tmp_88.vcf.gz job_out.89.d/out/tmp_89.vcf.gz job_out.90.d/out/tmp_90.vcf.gz job_out.91.d/out/tmp_91.vcf.gz job_out.92.d/out/tmp_92.vcf.gz job_out.93.d/out/tmp_93.vcf.gz job_out.94.d/out/tmp_94.vcf.gz job_out.95.d/out/tmp_95.vcf.gz job_out.96.d/out/tmp_96.vcf.gz job_out.97.d/out/tmp_97.vcf.gz job_out.98.d/out/tmp_98.vcf.gz job_out.99.d/out/tmp_99.vcf.gz job_out.100.d/out/tmp_100.vcf.gz job_out.101.d/out/tmp_101.vcf.gz job_out.102.d/out/tmp_102.vcf.gz job_out.103.d/out/tmp_103.vcf.gz job_out.104.d/out/tmp_104.vcf.gz job_out.105.d/out/tmp_105.vcf.gz job_out.106.d/out/tmp_106.vcf.gz job_out.107.d/out/tmp_107.vcf.gz job_out.108.d/out/tmp_108.vcf.gz job_out.109.d/out/tmp_109.vcf.gz job_out.110.d/out/tmp_110.vcf.gz job_out.111.d/out/tmp_111.vcf.gz job_out.112.d/out/tmp_112.vcf.gz job_out.113.d/out/tmp_113.vcf.gz job_out.114.d/out/tmp_114.vcf.gz job_out.115.d/out/tmp_115.vcf.gz job_out.116.d/out/tmp_116.vcf.gz job_out.117.d/out/tmp_117.vcf.gz job_out.118.d/out/tmp_118.vcf.gz job_out.119.d/out/tmp_119.vcf.gz job_out.120.d/out/tmp_120.vcf.gz job_out.121.d/out/tmp_121.vcf.gz job_out.122.d/out/tmp_122.vcf.gz job_out.123.d/out/tmp_123.vcf.gz job_out.124.d/out/tmp_124.vcf.gz job_out.125.d/out/tmp_125.vcf.gz job_out.126.d/out/tmp_126.vcf.gz job_out.127.d/out/tmp_127.vcf.gz job_out.128.d/out/tmp_128.vcf.gz job_out.129.d/out/tmp_129.vcf.gz job_out.130.d/out/tmp_130.vcf.gz job_out.131.d/out/tmp_131.vcf.gz job_out.132.d/out/tmp_132.vcf.gz; Date=Wed Sep 18 14:03:41 2019 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00102-N1-DNA1-WES1 -1 13656 . CAG C 125 . AC=2;AF=1;AN=2;DP=5;ExcessHet=3.0103;FS=0;MLEAC=2;MLEAF=1;MQ=22.81;QD=31.25;SOR=3.258;gnomad_exomes_an=25700;gnomad_exomes_hom=25;gnomad_exomes_het=709;gnomad_genomes_an=26970;gnomad_genomes_hom=102;gnomad_genomes_het=13005 GT:AD:DP:GQ:PL 1/1:0,4:4:12:162,12,0 +version https://git-lfs.github.com/spec/v1 +oid sha256:af35d25008b9ffc3c84cfb4ef7df60ba3176cb0b74664270cd02cde8b3d1f82f +size 17891