Skip to content

Commit

Permalink
feat!: extending clinvar-this v0.15 support to clinvar-minimal and cl…
Browse files Browse the repository at this point in the history
…invar-sv (#456)
  • Loading branch information
holtgrewe authored May 30, 2024
1 parent a792d9b commit b37514a
Show file tree
Hide file tree
Showing 69 changed files with 727 additions and 1,267 deletions.
92 changes: 5 additions & 87 deletions protos/annonars/clinvar/minimal.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,92 +4,10 @@ syntax = "proto3";

package annonars.clinvar.minimal;

// Kinds of variants that a record can describe.
enum VariantType {
  // Variant type is not known; as the zero value this is also the proto3
  // default for an unset field.
  VARIANT_TYPE_UNKNOWN = 0;
  // Deletion.
  VARIANT_TYPE_DELETION = 1;
  // Duplication.
  VARIANT_TYPE_DUPLICATION = 2;
  // Indel.
  VARIANT_TYPE_INDEL = 3;
  // Insertion.
  VARIANT_TYPE_INSERTION = 4;
  // Inversion.
  VARIANT_TYPE_INVERSION = 5;
  // Single nucleotide variant (SNV).
  VARIANT_TYPE_SNV = 6;
}

// ClinVar pathogenicity classification of a variant.
enum ClinicalSignificance {
  // Significance is not known; as the zero value this is also the proto3
  // default for an unset field.
  CLINICAL_SIGNIFICANCE_UNKNOWN = 0;
  // "Pathogenic"
  CLINICAL_SIGNIFICANCE_PATHOGENIC = 1;
  // "Likely pathogenic"
  CLINICAL_SIGNIFICANCE_LIKELY_PATHOGENIC = 2;
  // "Uncertain significance"
  CLINICAL_SIGNIFICANCE_UNCERTAIN_SIGNIFICANCE = 3;
  // "Likely benign"
  CLINICAL_SIGNIFICANCE_LIKELY_BENIGN = 4;
  // "Benign"
  CLINICAL_SIGNIFICANCE_BENIGN = 5;
}

// ClinVar review status of an assertion.
//
// The quoted text on each value is the review status label it stands for.
enum ReviewStatus {
  // Review status is not known; as the zero value this is also the proto3
  // default for an unset field.
  //
  // NOTE(review): the identifier carries a `PRACTICE_` infix that looks like
  // it was copied from the next value. It is left unchanged here because
  // renaming it would change the generated identifiers.
  REVIEW_STATUS_PRACTICE_UNKNOWN = 0;
  // "practice guideline"
  REVIEW_STATUS_PRACTICE_GUIDELINE = 1;
  // "reviewed by expert panel"
  REVIEW_STATUS_REVIEWED_BY_EXPERT_PANEL = 2;
  // "criteria provided, multiple submitters, no conflicts"
  REVIEW_STATUS_CRITERIA_PROVIDED_MULTIPLE_SUBMITTERS_NO_CONFLICTS = 3;
  // "criteria provided, single submitter"
  REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER = 4;
  // "criteria provided, conflicting interpretations"
  REVIEW_STATUS_CRITERIA_PROVIDED_CONFLICTING_INTERPRETATIONS = 5;
  // "no assertion criteria provided"
  REVIEW_STATUS_NO_ASSERTION_CRITERIA_PROVIDED = 6;
  // "no assertion provided"
  REVIEW_STATUS_NO_ASSERTION_PROVIDED = 7;
  // "flagged submission"
  REVIEW_STATUS_FLAGGED_SUBMISSION = 8;
  // "no classifications from unflagged records"
  REVIEW_STATUS_NO_CLASSIFICATIONS_FROM_UNFLAGGED_RECORDS = 9;
}

// One reference ClinVar assertion (RCV) attached to a variant.
message ReferenceAssertion {
  // RCV accession identifier.
  string rcv = 1;
  // Title of the reference assertion; includes the phenotype / disease.
  string title = 2;
  // Clinical significance asserted for the variant.
  ClinicalSignificance clinical_significance = 3;
  // Review status of the assertion.
  ReviewStatus review_status = 4;
}
import "annonars/clinvar_data/extracted_vars.proto";

// Record for storing minimal information on ClinVar for Mehari.
message Record {
// Genome release.
string release = 1;
// Chromosome name.
string chromosome = 2;
// 1-based start position.
uint32 start = 3;
// 1-based end position.
uint32 stop = 4;
// Reference allele bases in VCF notation.
string reference = 5;
// Alternative allele bases in VCF notation.
string alternative = 6;
// VCV accession identifier.
string vcv = 7;
// The reference assertions, sorted by (ClinicalSignificance, ReviewStatus).
repeated ReferenceAssertion reference_assertions = 8;
// Wrapper around a list of extracted ClinVar VCV records.
message ExtractedVcvRecordList {
  // The list of VCV records that may share a global variant.
  repeated clinvar_data.extracted_vars.ExtractedVcvRecord records = 1;
}
1 change: 0 additions & 1 deletion protos/annonars/clinvar/per_gene.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ syntax = "proto3";

package annonars.clinvar.per_gene;

import "annonars/clinvar/minimal.proto";
import "annonars/clinvar_data/class_by_freq.proto";
import "annonars/clinvar_data/extracted_vars.proto";
import "annonars/clinvar_data/gene_impact.proto";
Expand Down
38 changes: 2 additions & 36 deletions protos/annonars/clinvar/sv.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,46 +4,12 @@ syntax = "proto3";

package annonars.clinvar.sv;

import "annonars/clinvar/minimal.proto";

// Record for storing minimal information on ClinVar for Mehari.
//
// Besides the mandatory start/stop coordinates, the alleles and the
// inner/outer coordinates are declared `optional` and so have explicit
// presence.
message Record {
  // Genome release.
  string release = 1;
  // Chromosome name.
  string chromosome = 2;
  // Start position (1-based).
  uint32 start = 3;
  // End position (1-based).
  uint32 stop = 4;

  // Reference allele bases in VCF notation; may be absent.
  optional string reference = 5;
  // Alternative allele bases in VCF notation; may be absent.
  optional string alternative = 6;

  // Inner start position (1-based); may be absent.
  optional uint32 inner_start = 7;
  // Inner stop position (1-based); may be absent.
  optional uint32 inner_stop = 8;
  // Outer start position (1-based); may be absent.
  optional uint32 outer_start = 9;
  // Outer stop position (1-based); may be absent.
  optional uint32 outer_stop = 10;

  // Type of the variant.
  annonars.clinvar.minimal.VariantType variant_type = 11;

  // VCV accession identifier.
  string vcv = 12;
  // Reference assertions, sorted by (ClinicalSignificance, ReviewStatus).
  repeated annonars.clinvar.minimal.ReferenceAssertion reference_assertions = 13;
}
import "annonars/clinvar_data/extracted_vars.proto";

// A ClinVar record together with its overlap against the query.
message ResponseRecord {
  // The extracted VCV record.
  //
  // A message may declare the name `record` and field number 1 only once, so
  // this is the single declaration of that field; its type comes from
  // "annonars/clinvar_data/extracted_vars.proto".
  clinvar_data.extracted_vars.ExtractedVcvRecord record = 1;
  // The reciprocal overlap with the query.
  double overlap = 2;
}
Expand Down
99 changes: 46 additions & 53 deletions src/clinvar_minimal/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@ use std::{io::BufRead, sync::Arc};
use clap::Parser;
use prost::Message;

use crate::{
clinvar_minimal,
common::{self, keys},
pbs::clinvar::minimal::ReferenceAssertion,
};
use crate::common::{self, keys};

/// Command line arguments for `clinvar-minimal import` sub command.
#[derive(Parser, Debug, Clone)]
#[command(about = "import minimal ClinVar data into RocksDB", long_about = None)]
#[command(about = "import extracted seqvars ClinVar data into RocksDB", long_about = None)]
pub struct Args {
/// Genome build to use in the build.
#[arg(long, value_enum)]
Expand Down Expand Up @@ -57,40 +53,42 @@ fn jsonl_import(

for line in reader.lines() {
let line = line?;
let record = match serde_json::from_str::<clinvar_minimal::cli::reading::Record>(&line) {
let vcv_record = match serde_json::from_str::<
crate::pbs::clinvar_data::extracted_vars::ExtractedVcvRecord,
>(&line)
{
Ok(record) => record,
Err(e) => {
tracing::warn!("skipping line because of error: {}", e);
continue;
}
};

let clinvar_minimal::cli::reading::Record {
rcv,
vcv,
title,
clinical_significance,
review_status,
let crate::pbs::clinvar_data::extracted_vars::ExtractedVcvRecord {
accession,
rcvs: rcv_records,
sequence_location,
..
} = record;
let clinical_significance: crate::pbs::clinvar::minimal::ClinicalSignificance =
clinical_significance.into();
let review_status: crate::pbs::clinvar::minimal::ReviewStatus = review_status.into();
let clinvar_minimal::cli::reading::SequenceLocation {
assembly,
} = vcv_record.clone();
let accession = accession.expect("accession is required");
let vcv = format!("{}.{}", accession.accession, accession.version);
let sequence_location = sequence_location.expect("sequence_location is required");
let crate::pbs::clinvar_data::clinvar_public::location::SequenceLocation {
chr,
start,
stop,
reference_allele_vcf,
alternate_allele_vcf,
..
} = sequence_location;
if let (Some(start), Some(stop), Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(start, stop, reference_allele_vcf, alternate_allele_vcf)
let chr_pb =
crate::pbs::clinvar_data::clinvar_public::Chromosome::try_from(chr).map_err(|e| {
anyhow::anyhow!("problem converting chromosome {} to Chromosome: {}", chr, e)
})?;
if let (Some(start), Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(start, reference_allele_vcf, alternate_allele_vcf)
{
let var = keys::Var::from(
&chr,
&chr_pb.as_chr_name(),
start as i32,
&reference_allele_vcf,
&alternate_allele_vcf,
Expand All @@ -106,41 +104,36 @@ fn jsonl_import(
continue;
}
Ok(data) => {
db.put_cf(&cf_by_accession, rcv.as_bytes(), &key)?;
db.put_cf(&cf_by_accession, vcv.as_bytes(), &key)?;

let record = if let Some(data) = data {
let mut record = crate::pbs::clinvar::minimal::Record::decode(&data[..])?;
record.reference_assertions.push(
crate::pbs::clinvar::minimal::ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
review_status: review_status.into(),
},
);
record
.reference_assertions
.sort_by_key(|a| (a.clinical_significance, a.review_status));
for rcv_record in &rcv_records {
let accession = rcv_record
.accession
.as_ref()
.expect("rcv.accession is required");
let rcv = format!("{}.{}", accession.accession, accession.version);
db.put_cf(&cf_by_accession, rcv.as_bytes(), &key)?;
}

let new_record = if let Some(data) = data {
let mut record =
crate::pbs::clinvar::minimal::ExtractedVcvRecordList::decode(
&data[..],
)?;
record.records.push(vcv_record);
record.records.sort_by_key(|a| {
a.accession
.as_ref()
.expect("accession is required")
.accession
.clone()
});
record
} else {
crate::pbs::clinvar::minimal::Record {
release: assembly,
chromosome: chr,
start,
stop,
reference: reference_allele_vcf,
alternative: alternate_allele_vcf,
vcv,
reference_assertions: vec![ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
review_status: review_status.into(),
}],
crate::pbs::clinvar::minimal::ExtractedVcvRecordList {
records: vec![vcv_record],
}
};
let buf = record.encode_to_vec();
let buf = new_record.encode_to_vec();
db.put_cf(&cf_data, &key, &buf)?;
}
}
Expand Down
1 change: 0 additions & 1 deletion src/clinvar_minimal/cli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@
pub mod import;
pub mod query;
pub mod reading;
Loading

0 comments on commit b37514a

Please sign in to comment.