Skip to content

Commit

Permalink
feat: adding information from DECIPHER HI (#323) (#324)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Nov 22, 2023
1 parent 3273d1f commit 06b51a6
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 2 deletions.
14 changes: 14 additions & 0 deletions protos/annonars/genes/base.proto
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ message ClingenDosageRecord {
optional string triplosensitivity_disease_id = 8;
}

// DECIPHER HI predictions for a single gene.
//
// Embedded in `Record` as the optional `decipher_hi` field.
message DecipherHiRecord {
// HGNC identifier of the gene (e.g. "HGNC:100").
string hgnc_id = 1;
// Official HGNC gene symbol (e.g. "ASIC1").
string hgnc_symbol = 2;
// P(HI) prediction from DECIPHER HI.
// NOTE(review): presumably a probability in [0, 1] -- confirm against the data source.
double p_hi = 3;
// HI index, given in percent.
// NOTE(review): presumably in the range [0, 100] -- confirm against the data source.
double hi_index = 4;
}

// Information from DOMINO.
message DominoRecord {
// Gene symbol.
Expand Down Expand Up @@ -820,4 +832,6 @@ message Record {
GtexRecord gtex = 11;
// Information from DOMINO.
DominoRecord domino = 13;
// DECIPHER HI score.
DecipherHiRecord decipher_hi = 14;
}
35 changes: 35 additions & 0 deletions src/genes/cli/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ pub struct Record {
pub gtex: Option<gtex::Record>,
/// Information from DOMINO.
pub domino: Option<domino::Record>,
/// DECIPHER HI predictions.
pub decipher_hi: Option<decipher_hi::Record>,
}

/// Code for data from the ACMG secondary findings list.
Expand Down Expand Up @@ -227,6 +229,22 @@ pub mod clingen_gene {
}
}

/// Code for deserializing data from DECIPHER HI.
pub mod decipher_hi {
/// DECIPHER HI prediction.
#[derive(Debug, Clone, PartialEq, serde::Deserialize, serde::Serialize)]
pub struct Record {
/// HGNC identifier.
pub hgnc_id: String,
/// Official HGNC gene symbol.
pub hgnc_symbol: String,
/// P(HI) prediction from DECIPHER HI.
pub p_hi: f64,
/// Percent HI index.
pub hi_index: f64,
}
}

/// Code for deserializing data from dbNSFP gene.
pub mod dbnsfp_gene {
use serde::{Deserialize, Deserializer, Serialize, Serializer};
Expand Down Expand Up @@ -2123,6 +2141,23 @@ mod tests {
Ok(())
}

/// Snapshot test: every row of the DECIPHER HI fixture must deserialize into a
/// `decipher_hi::Record`, and the resulting list must match the stored YAML snapshot.
#[test]
fn deserialize_decipher_hi_record() -> Result<(), anyhow::Error> {
    // Slurp the fixture, then parse it as tab-separated values with a header row and a
    // fixed number of columns per line.
    let tsv_contents =
        std::fs::read_to_string("tests/genes/decipher/decipher_hi_prediction.tsv")?;
    let mut tsv_reader = csv::ReaderBuilder::new()
        .has_headers(true)
        .delimiter(b'\t')
        .flexible(false)
        .from_reader(tsv_contents.as_bytes());

    // Stop at the first row that fails to deserialize.
    let mut records: Vec<decipher_hi::Record> = Vec::new();
    for row in tsv_reader.deserialize::<decipher_hi::Record>() {
        records.push(row?);
    }
    insta::assert_yaml_snapshot!(records);

    Ok(())
}

#[test]
fn deserialize_dbnsfp_record() -> Result<(), anyhow::Error> {
let path_tsv = "tests/genes/dbnsfp/genes.tsv";
Expand Down
58 changes: 56 additions & 2 deletions src/genes/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ use crate::{
};

use super::data::{
self, acmg_sf, clingen_gene, dbnsfp_gene, domino, gnomad_constraints, gtex, hgnc, ncbi, omim,
orpha, rcnv, shet,
self, acmg_sf, clingen_gene, dbnsfp_gene, decipher_hi, domino, gnomad_constraints, gtex, hgnc,
ncbi, omim, orpha, rcnv, shet,
};

/// Command line arguments for `genes import` sub command.
Expand Down Expand Up @@ -64,6 +64,9 @@ pub struct Args {
/// Path to the DOMINO TSV file.
#[arg(long, required = true)]
pub path_in_domino: String,
/// Path to the DECIPHER HI file.
#[arg(long, required = true)]
pub path_in_decipher_hi: String,

/// Path to output RocksDB.
#[arg(long, required = true)]
Expand Down Expand Up @@ -144,6 +147,34 @@ fn load_gnomad_constraints(
Ok(result)
}

/// Load DECIPHER HI predictions.
///
/// The file is parsed as tab-separated values.  If `path` ends in `.gz`, the content is
/// transparently decompressed while reading; any other path is read as-is.
///
/// # Arguments
///
/// * `path` - Path to the (optionally gzip-compressed) DECIPHER HI TSV file.
///
/// # Result
///
/// A map from HGNC ID to DECIPHER HI record.  If the same HGNC ID occurs on more than
/// one row, the record from the last such row is kept.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or if any row cannot be read or
/// deserialized into a `decipher_hi::Record`.
fn load_decipher_hi(path: &str) -> Result<HashMap<String, decipher_hi::Record>, anyhow::Error> {
    info!(" loading DECIPHER HI information from {}", path);

    // Open once; only the decoding layer depends on the file extension.
    let file = std::fs::File::open(path)?;
    let input: Box<dyn Read> = if path.ends_with(".gz") {
        Box::new(flate2::bufread::MultiGzDecoder::new(BufReader::new(file)))
    } else {
        Box::new(BufReader::new(file))
    };

    let mut tsv_reader = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .from_reader(input);

    let mut result = HashMap::new();
    for record in tsv_reader.deserialize::<decipher_hi::Record>() {
        let record = record?;
        // Keyed by HGNC ID (not by gene symbol); a repeated ID replaces the earlier entry.
        result.insert(record.hgnc_id.clone(), record);
    }

    Ok(result)
}

/// Load dbNSFP genes information.
///
/// # Result
Expand Down Expand Up @@ -389,6 +420,7 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record {
shet,
gtex,
domino,
decipher_hi,
} = record;

let acmg_sf = acmg_sf.map(|acmg_sf| {
Expand Down Expand Up @@ -454,6 +486,22 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record {
}
});

let decipher_hi = decipher_hi.map(|decipher| {
let decipher_hi::Record {
hgnc_id,
hgnc_symbol,
p_hi,
hi_index,
} = decipher;

pbs::genes::base::DecipherHiRecord {
hgnc_id,
hgnc_symbol,
p_hi,
hi_index,
}
});

let dbnsfp = dbnsfp.map(|dbnsfp| {
let dbnsfp_gene::Record {
gene_name,
Expand Down Expand Up @@ -951,6 +999,7 @@ fn convert_record(record: data::Record) -> pbs::genes::base::Record {
shet,
gtex,
domino,
decipher_hi,
}
}

Expand All @@ -970,6 +1019,7 @@ fn write_rocksdb(
shet_by_hgnc_id: HashMap<String, shet::Record>,
gtex_by_hgnc_id: HashMap<String, gtex::Record>,
domino_by_symbol: HashMap<String, domino::Record>,
decipher_hi_by_hgnc_id: HashMap<String, decipher_hi::Record>,
args: &&Args,
) -> Result<(), anyhow::Error> {
// Construct RocksDB options and open file for writing.
Expand Down Expand Up @@ -1018,6 +1068,7 @@ fn write_rocksdb(
shet: shet_by_hgnc_id.get(&hgnc_id).cloned(),
gtex: gtex_by_hgnc_id.get(&hgnc_id).cloned(),
domino: domino_by_symbol.get(&hgnc_record.symbol).cloned(),
decipher_hi: decipher_hi_by_hgnc_id.get(&hgnc_id).cloned(),
});
tracing::debug!("writing {:?} -> {:?}", &hgnc, &record);
db.put_cf(&cf_genes, hgnc_id, &record.encode_to_vec())?;
Expand Down Expand Up @@ -1051,6 +1102,7 @@ pub fn run(common_args: &common::cli::Args, args: &Args) -> Result<(), anyhow::E
let shet_by_hgnc_id = load_shet(&args.path_in_shet)?;
let gtex_by_hgnc_id = load_gtex(&args.path_in_gtex)?;
let domino_by_symbol = load_domino(&args.path_in_domino)?;
let decipher_hi_by_hgnc_id = load_decipher_hi(&args.path_in_decipher_hi)?;
info!(
"... done loadin genes data files in {:?}",
before_loading.elapsed()
Expand All @@ -1072,6 +1124,7 @@ pub fn run(common_args: &common::cli::Args, args: &Args) -> Result<(), anyhow::E
shet_by_hgnc_id,
gtex_by_hgnc_id,
domino_by_symbol,
decipher_hi_by_hgnc_id,
&args,
)?;
info!(
Expand Down Expand Up @@ -1121,6 +1174,7 @@ pub mod test {
.into_os_string()
.into_string()
.unwrap(),
path_in_decipher_hi: String::from("tests/genes/decipher/decipher_hi_prediction.tsv"),
};

run(&common_args, &args)?;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
source: src/genes/cli/data.rs
expression: records
---
- hgnc_id: "HGNC:100"
hgnc_symbol: ASIC1
p_hi: 0.372226278
hi_index: 22.59
- hgnc_id: "HGNC:10000"
hgnc_symbol: RGS4
p_hi: 0.307829343
hi_index: 27.02
- hgnc_id: "HGNC:10001"
hgnc_symbol: RGS5
p_hi: 0.254559645
hi_index: 31.16
- hgnc_id: "HGNC:10002"
hgnc_symbol: RGS6
p_hi: 0.590032694
hi_index: 12.13
- hgnc_id: "HGNC:10003"
hgnc_symbol: RGS7
p_hi: 0.625263603
hi_index: 10.93
- hgnc_id: "HGNC:10004"
hgnc_symbol: RGS9
p_hi: 0.076663554
hi_index: 55.97
- hgnc_id: "HGNC:10006"
hgnc_symbol: RHAG
p_hi: 0.112613028
hi_index: 48.74
- hgnc_id: "HGNC:10007"
hgnc_symbol: RHBDL1
p_hi: 0.12245953
hi_index: 47.03
- hgnc_id: "HGNC:10008"
hgnc_symbol: RHCE
p_hi: 0.008358084
hi_index: 82.29
- hgnc_id: "HGNC:10009"
hgnc_symbol: RHD
p_hi: 0.008254663
hi_index: 82.4
- hgnc_id: "HGNC:1001"
hgnc_symbol: BCL6
p_hi: 0.913517921
hi_index: 3.15
- hgnc_id: "HGNC:10011"
hgnc_symbol: RHEB
p_hi: 0.789197392
hi_index: 6.12
- hgnc_id: "HGNC:10012"
hgnc_symbol: RHO
p_hi: 0.941179124
hi_index: 2.44
- hgnc_id: "HGNC:10013"
hgnc_symbol: GRK1
p_hi: 0.090604558
hi_index: 53.1
- hgnc_id: "HGNC:10017"
hgnc_symbol: RIT2
p_hi: 0.362346922
hi_index: 23.27
- hgnc_id: "HGNC:10018"
hgnc_symbol: RING1
p_hi: 0.223170581
hi_index: 34.04
- hgnc_id: "HGNC:10019"
hgnc_symbol: RIPK1
p_hi: 0.094475501
hi_index: 52.24
- hgnc_id: "HGNC:1002"
hgnc_symbol: BCL6B
p_hi: 0.213572706
hi_index: 35.16
- hgnc_id: "HGNC:10020"
hgnc_symbol: RIPK2
p_hi: 0.393221381
hi_index: 21.18
- hgnc_id: "HGNC:10021"
hgnc_symbol: RIPK3
p_hi: 0.001720654
hi_index: 91.79

21 changes: 21 additions & 0 deletions tests/genes/decipher/decipher_hi_prediction.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
hgnc_id hgnc_symbol p_hi hi_index
HGNC:100 ASIC1 0.372226278 22.59
HGNC:10000 RGS4 0.307829343 27.02
HGNC:10001 RGS5 0.254559645 31.16
HGNC:10002 RGS6 0.590032694 12.13
HGNC:10003 RGS7 0.625263603 10.93
HGNC:10004 RGS9 0.076663554 55.97
HGNC:10006 RHAG 0.112613028 48.74
HGNC:10007 RHBDL1 0.12245953 47.03
HGNC:10008 RHCE 0.008358084 82.29
HGNC:10009 RHD 0.008254663 82.4
HGNC:1001 BCL6 0.913517921 3.15
HGNC:10011 RHEB 0.789197392 6.12
HGNC:10012 RHO 0.941179124 2.44
HGNC:10013 GRK1 0.090604558 53.1
HGNC:10017 RIT2 0.362346922 23.27
HGNC:10018 RING1 0.223170581 34.04
HGNC:10019 RIPK1 0.094475501 52.24
HGNC:1002 BCL6B 0.213572706 35.16
HGNC:10020 RIPK2 0.393221381 21.18
HGNC:10021 RIPK3 0.001720654 91.79

0 comments on commit 06b51a6

Please sign in to comment.