Skip to content

Commit

Permalink
feat!: adding support for JSONL from v0.15 clinvar-this (#454)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored May 29, 2024
1 parent b4854b8 commit 484453a
Show file tree
Hide file tree
Showing 27 changed files with 12,981 additions and 416 deletions.
9,865 changes: 9,865 additions & 0 deletions ForRelease

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ use std::{env, path::PathBuf};
fn main() -> Result<(), anyhow::Error> {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos");
let proto_files = vec![
"annonars/clinvar_data/class_by_freq.proto",
"annonars/clinvar_data/clinvar_public.proto",
"annonars/clinvar_data/extracted_vars.proto",
"annonars/clinvar_data/gene_impact.proto",
"annonars/clinvar_data/phenotype_link.proto",
"annonars/clinvar/minimal.proto",
"annonars/clinvar/per_gene.proto",
"annonars/clinvar/sv.proto",
Expand Down Expand Up @@ -51,7 +56,7 @@ fn main() -> Result<(), anyhow::Error> {
let descriptor_set = std::fs::read(descriptor_path).unwrap();
pbjson_build::Builder::new()
.register_descriptors(&descriptor_set)?
.build(&[".annonars"])?;
.build(&[".annonars", ".clinvar_data"])?;

Ok(())
}
89 changes: 13 additions & 76 deletions protos/annonars/clinvar/per_gene.proto
Original file line number Diff line number Diff line change
Expand Up @@ -5,87 +5,24 @@ syntax = "proto3";
package annonars.clinvar.per_gene;

import "annonars/clinvar/minimal.proto";

// Kind of impact that a ClinVar variant has on a gene.
enum Impact {
  // Impact is not known; zero value of the enum.
  IMPACT_UNKNOWN = 0;
  // Variant in the 3' UTR.
  IMPACT_THREE_PRIME_UTR_VARIANT = 1;
  // Variant in the 5' UTR.
  IMPACT_FIVE_PRIME_UTR_VARIANT = 2;
  // Downstream gene variant.
  IMPACT_DOWNSTREAM_TRANSCRIPT_VARIANT = 3;
  // Frameshift variant.
  IMPACT_FRAMESHIFT_VARIANT = 4;
  // In-frame insertion or deletion.
  IMPACT_INFRAME_INDEL = 5;
  // Loss of the start codon.
  IMPACT_START_LOST = 6;
  // Intronic variant.
  IMPACT_INTRON_VARIANT = 7;
  // Missense variant.
  IMPACT_MISSENSE_VARIANT = 8;
  // Variant in a non-coding transcript.
  IMPACT_NON_CODING_TRANSCRIPT_VARIANT = 9;
  // Gain of a stop codon.
  IMPACT_STOP_GAINED = 10;
  // No alteration of the sequence.
  IMPACT_NO_SEQUENCE_ALTERATION = 11;
  // Variant at a splice acceptor site.
  IMPACT_SPLICE_ACCEPTOR_VARIANT = 12;
  // Variant at a splice donor site.
  IMPACT_SPLICE_DONOR_VARIANT = 13;
  // Loss of the stop codon.
  IMPACT_STOP_LOST = 14;
  // Synonymous variant.
  IMPACT_SYNONYMOUS_VARIANT = 15;
  // Upstream gene variant.
  IMPACT_UPSTREAM_TRANSCRIPT_VARIANT = 16;
}

// Variant counts for a single impact category.
message GeneImpactRecordCounts {
  // Impact category that the counts refer to.
  Impact impact = 1;
  // Variant counts for ACMG classes 1..5.
  repeated uint32 counts = 2;
}

// Clinical significance reduced to three coarse classes.
enum CoarseClinicalSignificance {
  // Significance is not known; zero value of the enum.
  COARSE_CLINICAL_SIGNIFICANCE_UNKNOWN = 0;
  // Benign or likely benign.
  COARSE_CLINICAL_SIGNIFICANCE_BENIGN = 1;
  // Uncertain significance.
  COARSE_CLINICAL_SIGNIFICANCE_UNCERTAIN = 2;
  // Pathogenic or likely pathogenic.
  COARSE_CLINICAL_SIGNIFICANCE_PATHOGENIC = 3;
}

// Variant counts for a single coarse clinical significance class.
// NOTE(review): the message name suggests the counts are split by
// frequency; confirm the binning against the code that fills this message.
message GeneFreqRecordCounts {
  // Coarse clinical significance class that the counts refer to.
  CoarseClinicalSignificance coarse_clinsig = 1;
  // Variant counts; previously documented as "ACMG1..5".
  // NOTE(review): that wording matches GeneImpactRecordCounts.counts and may
  // have been copied from there -- confirm what each entry represents.
  repeated uint32 counts = 2;
}

// Variants for a given genome release / assembly.
message GeneVariantsForRelease {
// Genome release / assembly
string genome_release = 1;
// Variants
repeated annonars.clinvar.minimal.Record variants = 2;
import "annonars/clinvar_data/class_by_freq.proto";
import "annonars/clinvar_data/extracted_vars.proto";
import "annonars/clinvar_data/gene_impact.proto";

// Extracted variant records belonging to one release.
message ExtractedVariantsPerRelease {
  // Version of the release; `optional` gives the field explicit presence.
  optional string release = 1;
  // Extracted variant records per gene for this release.
  repeated clinvar_data.extracted_vars.ExtractedVcvRecord variants = 2;
}

// ClinVar detailed information per gene.
//
// Each field number is declared exactly once. The earlier declarations that
// used the per_gene-local count messages shared numbers 1..3 (and the name
// `per_impact_counts`) with the declarations below, which protoc rejects.
message ClinvarPerGeneRecord {
  // Counts of variants per impact.
  optional clinvar_data.gene_impact.GeneImpactCounts per_impact_counts = 1;
  // Counts of variants per coarse clinical significance / frequency.
  optional clinvar_data.class_by_freq.GeneCoarseClinsigFrequencyCounts per_freq_counts = 2;
  // Variants for the given gene, one entry per release.
  repeated ExtractedVariantsPerRelease per_release_vars = 3;
}
29 changes: 29 additions & 0 deletions protos/annonars/clinvar_data/class_by_freq.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Protocol buffers for types for class-by-freq tool.

syntax = "proto3";

package annonars.clinvar_data.class_by_freq;

// Coarse-grained classification of clinical significance.
enum CoarseClinicalSignificance {
  // Coarse clinical significance was not specified; zero value of the enum.
  COARSE_CLINICAL_SIGNIFICANCE_UNSPECIFIED = 0;
  // The "benign" class.
  COARSE_CLINICAL_SIGNIFICANCE_BENIGN = 1;
  // The "uncertain" class.
  COARSE_CLINICAL_SIGNIFICANCE_UNCERTAIN = 2;
  // The "pathogenic" class.
  COARSE_CLINICAL_SIGNIFICANCE_PATHOGENIC = 3;
}

// Output record of the class-by-freq tool: count lists for one gene, split
// by coarse clinical significance.
// NOTE(review): each list presumably holds one entry per frequency bin;
// confirm the bin layout against the producer.
message GeneCoarseClinsigFrequencyCounts {
  // HGNC ID of the gene.
  string hgnc_id = 1;
  // Counts for pathogenic / likely pathogenic.
  repeated uint32 pathogenic_counts = 2;
  // Counts for uncertain significance.
  repeated uint32 uncertain_counts = 3;
  // Counts for benign / likely benign.
  repeated uint32 benign_counts = 4;
}
Loading

0 comments on commit 484453a

Please sign in to comment.