diff --git a/protos/annonars/genes/base.proto b/protos/annonars/genes/base.proto index 6a32cd06..da1eb576 100644 --- a/protos/annonars/genes/base.proto +++ b/protos/annonars/genes/base.proto @@ -64,6 +64,18 @@ message ClingenDosageRecord { optional string triplosensitivity_disease_id = 8; } +// Decipher HI Predictions +message DecipherHiRecord { + // HGNC identifier. + string hgnc_id = 1; + // Official HGNC gene symbol. + string hgnc_symbol = 2; + // P(HI) prediction from DECIPHER HI. + double p_hi = 3; + // Percent HI index. + double hi_index = 4; +} + // Information from DOMINO. message DominoRecord { // Gene symbol. @@ -820,4 +832,6 @@ message Record { GtexRecord gtex = 11; // Information from DOMINO. DominoRecord domino = 13; + // DECIPHER HI score. + DecipherHiRecord decipher_hi = 14; } diff --git a/src/genes/cli/data.rs b/src/genes/cli/data.rs index 2f4bf612..4bcbbe92 100644 --- a/src/genes/cli/data.rs +++ b/src/genes/cli/data.rs @@ -34,6 +34,8 @@ pub struct Record { pub gtex: Option, /// Information from DOMINO. pub domino: Option, + /// DECIPHER HI predictions. + pub decipher_hi: Option, } /// Code for data from the ACMG secondary findings list. @@ -227,6 +229,22 @@ pub mod clingen_gene { } } +/// Code for deserializing data from DECIPHER HI. +pub mod decipher_hi { + /// DECIPHER HI prediction. + #[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)] + pub struct Record { + /// HGNC identifier. + pub hgnc_id: String, + /// Official HGNC gene symbol. + pub hgnc_symbol: String, + /// P(HI) prediction from DECIPHER HI. + pub p_hi: f64, + /// Percent HI index. + pub hi_index: f64, + } +} + /// Code for deserializing data from dbNSFP gene. 
pub mod dbnsfp_gene { use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -2123,6 +2141,23 @@ mod tests { Ok(()) } + #[test] + fn deserialize_decipher_hi_record() -> Result<(), anyhow::Error> { + let path_tsv = "tests/genes/decipher/decipher_hi_prediction.tsv"; + let str_tsv = std::fs::read_to_string(path_tsv)?; + let mut rdr = csv::ReaderBuilder::new() + .has_headers(true) + .delimiter(b'\t') + .flexible(false) + .from_reader(str_tsv.as_bytes()); + let records = rdr + .deserialize() + .collect::, csv::Error>>()?; + insta::assert_yaml_snapshot!(records); + + Ok(()) + } + #[test] fn deserialize_dbnsfp_record() -> Result<(), anyhow::Error> { let path_tsv = "tests/genes/dbnsfp/genes.tsv"; diff --git a/src/genes/cli/import.rs b/src/genes/cli/import.rs index 55723087..de305586 100644 --- a/src/genes/cli/import.rs +++ b/src/genes/cli/import.rs @@ -17,8 +17,8 @@ use crate::{ }; use super::data::{ - self, acmg_sf, clingen_gene, dbnsfp_gene, domino, gnomad_constraints, gtex, hgnc, ncbi, omim, - orpha, rcnv, shet, + self, acmg_sf, clingen_gene, dbnsfp_gene, decipher_hi, domino, gnomad_constraints, gtex, hgnc, + ncbi, omim, orpha, rcnv, shet, }; /// Command line arguments for `genes import` sub command. @@ -64,6 +64,9 @@ pub struct Args { /// Path to the DOMINO TSV file. #[arg(long, required = true)] pub path_in_domino: String, + /// Path to the DECIPHER HI file. + #[arg(long, required = true)] + pub path_in_decipher_hi: String, /// Path to output RocksDB. #[arg(long, required = true)] @@ -144,6 +147,34 @@ fn load_gnomad_constraints( Ok(result) } +/// Load DECIPHER HI predictions. +/// +/// # Result +/// +/// A map from HGNC identifier to DECIPHER HI record. 
+fn load_decipher_hi(path: &str) -> Result, anyhow::Error> { + info!(" loading DECIPHER HI information from {}", path); + let mut result = HashMap::new(); + + let reader: Box = if path.ends_with(".gz") { + Box::new(flate2::bufread::MultiGzDecoder::new(BufReader::new( + std::fs::File::open(path)?, + ))) + } else { + Box::new(BufReader::new(std::fs::File::open(path)?)) + }; + + let mut reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .from_reader(reader); + for record in reader.deserialize::() { + let record = record?; + result.insert(record.hgnc_id.clone(), record); + } + + Ok(result) +} + /// Load dbNSFP genes information. /// /// # Result @@ -389,6 +420,7 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record { shet, gtex, domino, + decipher_hi, } = record; let acmg_sf = acmg_sf.map(|acmg_sf| { @@ -454,6 +486,22 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record { } }); + let decipher_hi = decipher_hi.map(|decipher| { + let decipher_hi::Record { + hgnc_id, + hgnc_symbol, + p_hi, + hi_index, + } = decipher; + + pbs::genes::base::DecipherHiRecord { + hgnc_id, + hgnc_symbol, + p_hi, + hi_index, + } + }); + let dbnsfp = dbnsfp.map(|dbnsfp| { let dbnsfp_gene::Record { gene_name, @@ -951,6 +999,7 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record { shet, gtex, domino, + decipher_hi, } } @@ -970,6 +1019,7 @@ fn write_rocksdb( shet_by_hgnc_id: HashMap, gtex_by_hgnc_id: HashMap, domino_by_symbol: HashMap, + decipher_hi_by_hgnc_id: HashMap, args: &&Args, ) -> Result<(), anyhow::Error> { // Construct RocksDB options and open file for writing. 
@@ -1018,6 +1068,7 @@ fn write_rocksdb( shet: shet_by_hgnc_id.get(&hgnc_id).cloned(), gtex: gtex_by_hgnc_id.get(&hgnc_id).cloned(), domino: domino_by_symbol.get(&hgnc_record.symbol).cloned(), + decipher_hi: decipher_hi_by_hgnc_id.get(&hgnc_id).cloned(), }); tracing::debug!("writing {:?} -> {:?}", &hgnc, &record); db.put_cf(&cf_genes, hgnc_id, &record.encode_to_vec())?; @@ -1051,6 +1102,7 @@ pub fn run(common_args: &common::cli::Args, args: &Args) -> Result<(), anyhow::E let shet_by_hgnc_id = load_shet(&args.path_in_shet)?; let gtex_by_hgnc_id = load_gtex(&args.path_in_gtex)?; let domino_by_symbol = load_domino(&args.path_in_domino)?; + let decipher_hi_by_hgnc_id = load_decipher_hi(&args.path_in_decipher_hi)?; info!( "... done loadin genes data files in {:?}", before_loading.elapsed() @@ -1072,6 +1124,7 @@ pub fn run(common_args: &common::cli::Args, args: &Args) -> Result<(), anyhow::E shet_by_hgnc_id, gtex_by_hgnc_id, domino_by_symbol, + decipher_hi_by_hgnc_id, &args, )?; info!( @@ -1121,6 +1174,7 @@ pub mod test { .into_os_string() .into_string() .unwrap(), + path_in_decipher_hi: String::from("tests/genes/decipher/decipher_hi_prediction.tsv"), }; run(&common_args, &args)?; diff --git a/src/genes/cli/snapshots/annonars__genes__cli__data__tests__deserialize_decipher_hi_record.snap b/src/genes/cli/snapshots/annonars__genes__cli__data__tests__deserialize_decipher_hi_record.snap new file mode 100644 index 00000000..44abb613 --- /dev/null +++ b/src/genes/cli/snapshots/annonars__genes__cli__data__tests__deserialize_decipher_hi_record.snap @@ -0,0 +1,85 @@ +--- +source: src/genes/cli/data.rs +expression: records +--- +- hgnc_id: "HGNC:100" + hgnc_symbol: ASIC1 + p_hi: 0.372226278 + hi_index: 22.59 +- hgnc_id: "HGNC:10000" + hgnc_symbol: RGS4 + p_hi: 0.307829343 + hi_index: 27.02 +- hgnc_id: "HGNC:10001" + hgnc_symbol: RGS5 + p_hi: 0.254559645 + hi_index: 31.16 +- hgnc_id: "HGNC:10002" + hgnc_symbol: RGS6 + p_hi: 0.590032694 + hi_index: 12.13 +- hgnc_id: "HGNC:10003" + 
hgnc_symbol: RGS7 + p_hi: 0.625263603 + hi_index: 10.93 +- hgnc_id: "HGNC:10004" + hgnc_symbol: RGS9 + p_hi: 0.076663554 + hi_index: 55.97 +- hgnc_id: "HGNC:10006" + hgnc_symbol: RHAG + p_hi: 0.112613028 + hi_index: 48.74 +- hgnc_id: "HGNC:10007" + hgnc_symbol: RHBDL1 + p_hi: 0.12245953 + hi_index: 47.03 +- hgnc_id: "HGNC:10008" + hgnc_symbol: RHCE + p_hi: 0.008358084 + hi_index: 82.29 +- hgnc_id: "HGNC:10009" + hgnc_symbol: RHD + p_hi: 0.008254663 + hi_index: 82.4 +- hgnc_id: "HGNC:1001" + hgnc_symbol: BCL6 + p_hi: 0.913517921 + hi_index: 3.15 +- hgnc_id: "HGNC:10011" + hgnc_symbol: RHEB + p_hi: 0.789197392 + hi_index: 6.12 +- hgnc_id: "HGNC:10012" + hgnc_symbol: RHO + p_hi: 0.941179124 + hi_index: 2.44 +- hgnc_id: "HGNC:10013" + hgnc_symbol: GRK1 + p_hi: 0.090604558 + hi_index: 53.1 +- hgnc_id: "HGNC:10017" + hgnc_symbol: RIT2 + p_hi: 0.362346922 + hi_index: 23.27 +- hgnc_id: "HGNC:10018" + hgnc_symbol: RING1 + p_hi: 0.223170581 + hi_index: 34.04 +- hgnc_id: "HGNC:10019" + hgnc_symbol: RIPK1 + p_hi: 0.094475501 + hi_index: 52.24 +- hgnc_id: "HGNC:1002" + hgnc_symbol: BCL6B + p_hi: 0.213572706 + hi_index: 35.16 +- hgnc_id: "HGNC:10020" + hgnc_symbol: RIPK2 + p_hi: 0.393221381 + hi_index: 21.18 +- hgnc_id: "HGNC:10021" + hgnc_symbol: RIPK3 + p_hi: 0.001720654 + hi_index: 91.79 + diff --git a/tests/genes/decipher/decipher_hi_prediction.tsv b/tests/genes/decipher/decipher_hi_prediction.tsv new file mode 100644 index 00000000..0931bfb3 --- /dev/null +++ b/tests/genes/decipher/decipher_hi_prediction.tsv @@ -0,0 +1,21 @@ +hgnc_id hgnc_symbol p_hi hi_index +HGNC:100 ASIC1 0.372226278 22.59 +HGNC:10000 RGS4 0.307829343 27.02 +HGNC:10001 RGS5 0.254559645 31.16 +HGNC:10002 RGS6 0.590032694 12.13 +HGNC:10003 RGS7 0.625263603 10.93 +HGNC:10004 RGS9 0.076663554 55.97 +HGNC:10006 RHAG 0.112613028 48.74 +HGNC:10007 RHBDL1 0.12245953 47.03 +HGNC:10008 RHCE 0.008358084 82.29 +HGNC:10009 RHD 0.008254663 82.4 +HGNC:1001 BCL6 0.913517921 3.15 +HGNC:10011 RHEB 0.789197392 6.12 
+HGNC:10012 RHO 0.941179124 2.44 +HGNC:10013 GRK1 0.090604558 53.1 +HGNC:10017 RIT2 0.362346922 23.27 +HGNC:10018 RING1 0.223170581 34.04 +HGNC:10019 RIPK1 0.094475501 52.24 +HGNC:1002 BCL6B 0.213572706 35.16 +HGNC:10020 RIPK2 0.393221381 21.18 +HGNC:10021 RIPK3 0.001720654 91.79