Skip to content

Commit

Permalink
feat: properly represent clinvar VCV/RCV structure in protobufs (#242)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Oct 18, 2023
1 parent bf74e69 commit 9095773
Show file tree
Hide file tree
Showing 28 changed files with 484 additions and 406 deletions.
58 changes: 36 additions & 22 deletions src/clinvar_genes/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use crate::{
clinvar_genes::{
self,
pbs::{
GeneFreqRecordCounts, GeneImpactRecordCounts, GeneVariantsForRelease, SequenceVariant,
ClinicalSignificance, GeneFreqRecordCounts, GeneImpactRecordCounts,
GeneVariantsForRelease, ReferenceAssertion, ReviewStatus, SequenceVariant,
},
},
clinvar_minimal, common,
Expand Down Expand Up @@ -111,13 +112,16 @@ fn load_per_frequency_jsonl(
Ok(result)
}

/// Sequence variants keyed by VCV accession.
type PerVcv = indexmap::IndexMap<String, SequenceVariant>;
/// Per-VCV variant maps keyed by assembly (genome release).
type PerAssembly = indexmap::IndexMap<String, PerVcv>;
/// Per-assembly maps keyed by HGNC gene identifier.
type PerGene = indexmap::IndexMap<String, PerAssembly>;

/// Load per-gene sequence variants.
fn load_variants_jsonl(
variant_jsonls: &[String],
) -> Result<indexmap::IndexMap<String, Vec<GeneVariantsForRelease>>, anyhow::Error> {
// Build intermediate data structure using nested maps.
let mut tmp: indexmap::IndexMap<String, indexmap::IndexMap<String, Vec<SequenceVariant>>> =
indexmap::IndexMap::new();
let mut per_gene: PerGene = Default::default();
for path_jsonl in variant_jsonls {
let reader: Box<dyn std::io::Read> = if path_jsonl.ends_with(".gz") {
Box::new(flate2::read::GzDecoder::new(std::fs::File::open(
Expand All @@ -136,52 +140,62 @@ fn load_variants_jsonl(

let clinvar_minimal::cli::reading::Record {
rcv,
vcv,
title,
hgnc_ids,
clinical_significance,
review_status,
sequence_location,
} = input_record;
let clinvar_minimal::cli::reading::SequenceLocation {
assembly,
chr,
start,
chr: chrom,
start: pos,
reference_allele_vcf,
alternate_allele_vcf,
..
} = sequence_location;

if let (Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
if let (Some(reference), Some(alternative)) =
(reference_allele_vcf, alternate_allele_vcf)
{
for hgnc_id in hgnc_ids {
let per_gene = tmp.entry(hgnc_id).or_default();
let per_release = per_gene.entry(assembly.clone()).or_default();
let clinsig: crate::clinvar_minimal::pbs::ClinicalSignificance =
clinical_significance.clone().into();
let review_status: crate::clinvar_minimal::pbs::ReviewStatus =
review_status.clone().into();
per_release.push(SequenceVariant {
chrom: chr.clone(),
pos: start,
reference: reference_allele_vcf.clone(),
alternative: alternate_allele_vcf.clone(),
let per_release = per_gene.entry(hgnc_id).or_default();
let per_vcv = per_release.entry(assembly.clone()).or_default();
let seqvar = per_vcv
.entry(vcv.clone())
.or_insert_with(|| SequenceVariant {
chrom: chrom.clone(),
pos,
reference: reference.clone(),
alternative: alternative.clone(),
vcv: vcv.clone(),
reference_assertions: vec![],
});
seqvar.reference_assertions.push(ReferenceAssertion {
rcv: rcv.clone(),
clinsig: clinsig as i32,
review_status: review_status as i32,
})
title: title.clone(),
clinical_significance: Into::<ClinicalSignificance>::into(
clinical_significance,
) as i32,
review_status: Into::<ReviewStatus>::into(review_status) as i32,
});
seqvar
.reference_assertions
.sort_by_key(|a| (a.clinical_significance, a.review_status));
}
}
}
}

// Convert into final data structure that uses lists of entry records rather than nested maps.
let mut result = indexmap::IndexMap::new();
for (hgnc_id, per_gene) in tmp {
for (hgnc_id, per_gene) in per_gene {
let mut per_gene_out = Vec::new();
for (genome_release, per_release) in per_gene {
per_gene_out.push(GeneVariantsForRelease {
genome_release,
variants: per_release,
variants: per_release.values().cloned().collect(),
});
}
result.insert(hgnc_id, per_gene_out);
Expand Down
58 changes: 41 additions & 17 deletions src/clinvar_minimal/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ fn jsonl_import(

let clinvar_minimal::cli::reading::Record {
rcv,
vcv,
title,
clinical_significance,
review_status,
sequence_location,
Expand All @@ -75,27 +77,49 @@ fn jsonl_import(
if let (Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(reference_allele_vcf, alternate_allele_vcf)
{
let record = clinvar_minimal::pbs::Record {
release: assembly,
chromosome: chr,
start,
stop,
reference: reference_allele_vcf,
alternative: alternate_allele_vcf,
rcv,
clinical_significance: clinical_significance.into(),
review_status: review_status.into(),
};
let buf = record.encode_to_vec();

let var = keys::Var::from(
&record.chromosome,
record.start as i32,
&record.reference,
&record.alternative,
&chr,
start as i32,
&reference_allele_vcf,
&alternate_allele_vcf,
);
let key: Vec<u8> = var.into();

let record = if let Some(data) = db
.get_cf(&cf_data, key.clone())
.map_err(|e| anyhow::anyhow!("problem querying database: {}", e))?
{
let mut record = clinvar_minimal::pbs::Record::decode(&data[..])?;
record
.reference_assertions
.push(clinvar_minimal::pbs::ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
review_status: review_status.into(),
});
record
.reference_assertions
.sort_by_key(|a| (a.clinical_significance, a.review_status));
record
} else {
clinvar_minimal::pbs::Record {
release: assembly,
chromosome: chr,
start,
stop,
reference: reference_allele_vcf,
alternative: alternate_allele_vcf,
vcv,
reference_assertions: vec![clinvar_minimal::pbs::ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
review_status: review_status.into(),
}],
}
};
let buf = record.encode_to_vec();
db.put_cf(&cf_data, key, buf)?;
}
}
Expand Down
123 changes: 87 additions & 36 deletions src/clinvar_minimal/cli/reading.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ use std::{fmt::Display, str::FromStr};

use serde::{Deserialize, Serialize};

use crate::clinvar_minimal::pbs;

/// Enumeration for ClinVar clinical significance for (de)serialization.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ClinicalSignificance {
/// Pathogenic.
Pathogenic,
Expand Down Expand Up @@ -67,16 +65,44 @@ impl<'de> Deserialize<'de> for ClinicalSignificance {
}
}

impl From<ClinicalSignificance> for pbs::ClinicalSignificance {
impl From<ClinicalSignificance> for crate::clinvar_minimal::pbs::ClinicalSignificance {
fn from(value: ClinicalSignificance) -> Self {
match value {
ClinicalSignificance::Pathogenic => pbs::ClinicalSignificance::Pathogenic,
ClinicalSignificance::LikelyPathogenic => pbs::ClinicalSignificance::LikelyPathogenic,
ClinicalSignificance::Pathogenic => {
crate::clinvar_minimal::pbs::ClinicalSignificance::Pathogenic
}
ClinicalSignificance::LikelyPathogenic => {
crate::clinvar_minimal::pbs::ClinicalSignificance::LikelyPathogenic
}
ClinicalSignificance::UncertainSignificance => {
pbs::ClinicalSignificance::UncertainSignificance
crate::clinvar_minimal::pbs::ClinicalSignificance::UncertainSignificance
}
ClinicalSignificance::LikelyBenign => {
crate::clinvar_minimal::pbs::ClinicalSignificance::LikelyBenign
}
ClinicalSignificance::Benign => {
crate::clinvar_minimal::pbs::ClinicalSignificance::Benign
}
ClinicalSignificance::LikelyBenign => pbs::ClinicalSignificance::LikelyBenign,
ClinicalSignificance::Benign => pbs::ClinicalSignificance::Benign,
}
}
}

/// Converts the (de)serialization `ClinicalSignificance` into the enum of the same name in
/// `clinvar_genes::pbs`.
///
/// Every variant is mapped onto its identically named counterpart.
impl From<ClinicalSignificance> for crate::clinvar_genes::pbs::ClinicalSignificance {
    fn from(value: ClinicalSignificance) -> Self {
        // `Self` is the `clinvar_genes::pbs` enum here, which keeps the arms short.
        match value {
            ClinicalSignificance::Pathogenic => Self::Pathogenic,
            ClinicalSignificance::LikelyPathogenic => Self::LikelyPathogenic,
            ClinicalSignificance::UncertainSignificance => Self::UncertainSignificance,
            ClinicalSignificance::LikelyBenign => Self::LikelyBenign,
            ClinicalSignificance::Benign => Self::Benign,
        }
    }
}
Expand All @@ -95,22 +121,22 @@ impl From<i32> for ClinicalSignificance {
}

/// Enumeration for ClinVar review status for (de)serialization.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ReviewStatus {
/// "no assertion provided"
NoAssertionProvided,
/// "no assertion criteria provided"
NoAssertionCriteriaProvided,
/// "criteria provided, conflicting interpretations"
CriteriaProvidedConflictingInterpretations,
/// "criteria provided, single submitter"
CriteriaProvidedSingleSubmitter,
/// "criteria provided, multiple submitters, no conflicts"
CriteriaProvidedMultipleSubmittersNoConflicts,
/// "reviewed by expert panel"
ReviewedByExpertPanel,
/// "practice guideline"
PracticeGuideline,
/// "reviewed by expert panel"
ReviewedByExpertPanel,
/// "criteria provided, multiple submitters, no conflicts"
CriteriaProvidedMultipleSubmittersNoConflicts,
/// "criteria provided, single submitter"
CriteriaProvidedSingleSubmitter,
/// "criteria provided, conflicting interpretations"
CriteriaProvidedConflictingInterpretations,
/// "no assertion criteria provided"
NoAssertionCriteriaProvided,
/// "no assertion provided"
NoAssertionProvided,
}

impl Display for ReviewStatus {
Expand Down Expand Up @@ -177,38 +203,59 @@ impl<'de> Deserialize<'de> for ReviewStatus {
}
}

impl From<ReviewStatus> for pbs::ReviewStatus {
impl From<ReviewStatus> for crate::clinvar_minimal::pbs::ReviewStatus {
fn from(value: ReviewStatus) -> Self {
match value {
ReviewStatus::NoAssertionProvided => pbs::ReviewStatus::NoAssertionProvided,
ReviewStatus::NoAssertionProvided => crate::clinvar_minimal::pbs::ReviewStatus::NoAssertionProvided,
ReviewStatus::NoAssertionCriteriaProvided => {
pbs::ReviewStatus::NoAssertionCriteriaProvided
crate::clinvar_minimal::pbs::ReviewStatus::NoAssertionCriteriaProvided
}
ReviewStatus::CriteriaProvidedConflictingInterpretations => {
pbs::ReviewStatus::CriteriaProvidedConflictingInterpretations
crate::clinvar_minimal::pbs::ReviewStatus::CriteriaProvidedConflictingInterpretations
}
ReviewStatus::CriteriaProvidedSingleSubmitter => {
pbs::ReviewStatus::CriteriaProvidedSingleSubmitter
crate::clinvar_minimal::pbs::ReviewStatus::CriteriaProvidedSingleSubmitter
}
ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts => {
pbs::ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts
crate::clinvar_minimal::pbs::ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts
}
ReviewStatus::ReviewedByExpertPanel => pbs::ReviewStatus::ReviewedByExpertPanel,
ReviewStatus::PracticeGuideline => pbs::ReviewStatus::PracticeGuideline,
ReviewStatus::ReviewedByExpertPanel => crate::clinvar_minimal::pbs::ReviewStatus::ReviewedByExpertPanel,
ReviewStatus::PracticeGuideline => crate::clinvar_minimal::pbs::ReviewStatus::PracticeGuideline,
}
}
}

/// Converts the (de)serialization `ReviewStatus` into the enum of the same name in
/// `clinvar_genes::pbs`.
///
/// Every variant is mapped onto its identically named counterpart.
impl From<ReviewStatus> for crate::clinvar_genes::pbs::ReviewStatus {
    fn from(value: ReviewStatus) -> Self {
        // `Self` is the `clinvar_genes::pbs` enum here, which keeps the arms short.
        match value {
            ReviewStatus::NoAssertionProvided => Self::NoAssertionProvided,
            ReviewStatus::NoAssertionCriteriaProvided => Self::NoAssertionCriteriaProvided,
            ReviewStatus::CriteriaProvidedConflictingInterpretations => {
                Self::CriteriaProvidedConflictingInterpretations
            }
            ReviewStatus::CriteriaProvidedSingleSubmitter => Self::CriteriaProvidedSingleSubmitter,
            ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts => {
                Self::CriteriaProvidedMultipleSubmittersNoConflicts
            }
            ReviewStatus::ReviewedByExpertPanel => Self::ReviewedByExpertPanel,
            ReviewStatus::PracticeGuideline => Self::PracticeGuideline,
        }
    }
}
impl From<i32> for ReviewStatus {
fn from(value: i32) -> Self {
match value {
0 => ReviewStatus::NoAssertionProvided,
1 => ReviewStatus::NoAssertionCriteriaProvided,
2 => ReviewStatus::CriteriaProvidedConflictingInterpretations,
0 => ReviewStatus::PracticeGuideline,
1 => ReviewStatus::ReviewedByExpertPanel,
2 => ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts,
3 => ReviewStatus::CriteriaProvidedSingleSubmitter,
4 => ReviewStatus::CriteriaProvidedMultipleSubmittersNoConflicts,
5 => ReviewStatus::ReviewedByExpertPanel,
6 => ReviewStatus::PracticeGuideline,
4 => ReviewStatus::CriteriaProvidedConflictingInterpretations,
5 => ReviewStatus::NoAssertionCriteriaProvided,
6 => ReviewStatus::NoAssertionProvided,
_ => unreachable!(),
}
}
Expand All @@ -219,6 +266,10 @@ impl From<i32> for ReviewStatus {
pub struct Record {
/// RCV accession identifier.
pub rcv: String,
/// VCV accession identifier.
pub vcv: String,
/// RCV title.
pub title: String,
/// HGNC ids
pub hgnc_ids: Vec<String>,
/// ClinVar clinical significance
Expand Down
Loading

0 comments on commit 9095773

Please sign in to comment.