Skip to content

Commit

Permalink
feat: gnomAD v4 frequency import (#275) (#368)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Dec 28, 2023
1 parent 29f71d1 commit 2a7e098
Show file tree
Hide file tree
Showing 203 changed files with 17,202 additions and 221 deletions.
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ tracing = "0.1"
tracing-subscriber = "0.3"
rustc-hash = "1.1.0"
noodles-gff = "0.25.0"
erased-serde = "0.4.1"

[build-dependencies]
prost-build = "0.12"
Expand All @@ -71,3 +72,9 @@ temp_testdir = "0.2"
test-log = "0.2"
tracing-subscriber = "0.3"
tracing-test = "0.2.4"

# Compile insta with full optimization.
[profile.dev.package.insta]
opt-level = 3
[profile.dev.package.similar]
opt-level = 3
2 changes: 2 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ fn main() -> Result<(), anyhow::Error> {
"annonars/gnomad/exac_cnv.proto",
"annonars/gnomad/gnomad2.proto",
"annonars/gnomad/gnomad3.proto",
"annonars/gnomad/gnomad4.proto",
"annonars/gnomad/gnomad_cnv4.proto",
"annonars/gnomad/gnomad_sv2.proto",
"annonars/gnomad/gnomad_sv4.proto",
"annonars/gnomad/mtdna.proto",
"annonars/gnomad/vep_common.proto",
"annonars/gnomad/vep_gnomad2.proto",
"annonars/gnomad/vep_gnomad3.proto",
"annonars/gnomad/vep_gnomad4.proto",
"annonars/helixmtdb/base.proto",
"annonars/regions/clingen.proto",
]
Expand Down
17 changes: 15 additions & 2 deletions protos/annonars/gnomad/gnomad3.proto
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// Protobufs for gnomAD v3.
//
// In the case of smaller additions in **both exomes and genomes** in v4.0,
// we extended the protobufs here and re-used them.

syntax = "proto3";

package annonars.gnomad.gnomad3;
Expand All @@ -11,7 +16,11 @@ enum Filter {
// Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB <
// 0.2 for het calls).
FILTER_ALLELE_COUNT_IS_ZERO = 1;
// Failed VQSR filtering thresholds of -2.7739 for SNPs and -1.0606 for indels.
// Failed VQSR filtering thresholds of:
//
// gnomAD-genomes v3: -2.7739 for SNPs and -1.0606 for indels
// gnomAD-genomes v4: -2.502 for SNPs and -0.7156 for indels
// gnomAD-exomes v4: -1.4526 for SNPs and 0.0717 for indels
FILTER_AS_VSQR = 2;
// InbreedingCoeff < -0.3.
FILTER_INBREEDING_COEFF = 3;
Expand Down Expand Up @@ -185,6 +194,8 @@ message QualityInfo {
optional int32 as_qual_approx = 18;
// Allele-specific forward/reverse read counts for strand bias tests.
optional string as_sb_table = 19;
// Strand bias estimated by the symmetric odds ratio test (v4 only).
optional float sor = 20;
}


Expand All @@ -202,9 +213,11 @@ message VariantInfo {
bool monoallelic = 5;
// Depth over variant genotypes (does not include depth of reference samples).
int32 var_dp = 6;
// Allele-specific depth over variant genotypes (does not include depth of reference samples) (v4 only).
optional int32 as_vardp = 7;
}

// Protocol buffer for the gnomAD-mtDNA VCF record.
// Protocol buffer for the gnomAD-nuclear VCF record.
//
// The more specialized fields from the INFO column are stored in separate, optional fields such
// that we don't end up with a humongous message.
Expand Down
130 changes: 130 additions & 0 deletions protos/annonars/gnomad/gnomad4.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
syntax = "proto3";

package annonars.gnomad.gnomad4;

import "annonars/gnomad/gnomad3.proto";
import "annonars/gnomad/vep_gnomad4.proto";

// Store details on variant effect predictions.
message EffectInfo {
// Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering">
optional float pangolin_largest_ds = 1;
// Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).">
optional float phylop = 2;
// Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
optional float polyphen_max = 3;
// The maximum REVEL score at a site's MANE Select or canonical transcript. It's an ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors). Scores ranges from 0 to 1. Variants with higher scores are predicted to be more likely to be deleterious.">
optional float revel_max = 4;
// Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
optional float sift_max = 5;
// Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering.">
optional float spliceai_ds_max = 6;
// Raw CADD scores are interpretable as the extent to which the annotation profile for a given variant suggests that the variant is likely to be 'observed' (negative values) vs 'simulated' (positive values). Larger values are more deleterious.
optional float cadd_raw = 7;
// Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values are more deleterious.
optional float cadd_phred = 8;
}

// Store the allele counts for the given sub cohort in the given ancestry group.
message AncestryGroupAlleleCounts {
// Name of the ancestry group.
string ancestry_group = 1;
// The overall allele counts and the one by sex.
annonars.gnomad.gnomad3.AlleleCountsBySex counts = 2;

// The filtering allele frequency (using Poisson 95% CI).
optional float faf95 = 3;
// The filtering allele frequency (using Poisson 99% CI).
optional float faf99 = 4;
// The filtering allele frequency for XX samples (using Poisson 95% CI).
optional float faf95_xx = 5;
// The filtering allele frequency for XX samples (using Poisson 99% CI).
optional float faf99_xx = 6;
// The filtering allele frequency for XY samples (using Poisson 95% CI).
optional float faf95_xy = 7;
// The filtering allele frequency for XY samples (using Poisson 99% CI).
optional float faf99_xy = 8;
}

// Store the allele counts for the given cohort.
message CohortAlleleCounts {
// Name of the cohort.
optional string cohort = 1;
// Allele counts for each population.
repeated AncestryGroupAlleleCounts by_ancestry_group = 2;
// Allele counts by sex.
annonars.gnomad.gnomad3.AlleleCountsBySex by_sex = 3;
// Raw allele counts.
annonars.gnomad.gnomad3.AlleleCounts raw = 4;

// The ancestry group with maximum AF.
optional string grpmax = 5;
// Maximum allele frequency across ancestry groups.
optional float af_grpmax = 6;
// Allele count in ancestry group with maximum AF.
optional int32 ac_grpmax = 7;
// Total number of alleles in ancestry group with maximum AF.
optional int32 an_grpmax = 8;
// Total number of homozygous individuals in ancestry group with maximum AF.
optional int32 nhomalt_grpmax = 9;
}

// VRS information
message VrsInfo {
// The computed identifiers for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated string allele_ids = 1;
// Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated int32 ends = 2;
// Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated int32 starts = 3;
// The literal sequence states used for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated string states = 4;
}

// Protocol buffer for the gnomAD-nuclear VCF record.
//
// The more specialized fields from the INFO column are stored in separate, optional fields such
// that we don't end up with a humongous message.
message Record {
// Chromosome name.
string chrom = 1;
// 1-based start position.
int32 pos = 2;
// Reference allele.
string ref_allele = 3;
// Alternate allele.
string alt_allele = 4;

// Site-level filters.
repeated annonars.gnomad.gnomad3.Filter filters = 5;
// VEP annotation records.
repeated annonars.gnomad.vep_gnomad4.Vep vep = 6;

// Variant allele counts in the different cohorts and population.
//
// The populations in gnomAD v4 are: empty for global, "joint" for exome+genomes.
repeated CohortAlleleCounts allele_counts = 7;
// Variant (on sex chromosome) falls outside a pseudoautosomal region
bool nonpar = 8;
// All samples are heterozygous for the variant
bool only_het = 9;
// Variant falls outside of Broad exome capture regions (exomes only).
bool outside_broad_capture_region = 10;
// Variant falls outside of UK Biobank exome capture regions(exomes only).
bool outside_ukb_capture_region = 11;
// Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort) (exomes only).
bool sibling_singleton = 12;

// Information on variant scores.
optional EffectInfo effect_info = 13;
// Variant-related information details.
optional annonars.gnomad.gnomad3.VariantInfo variant_info = 14;
// Summary information for variant quality interpretation.
optional annonars.gnomad.gnomad3.QualityInfo quality_info = 15;
// Age-related information.
optional annonars.gnomad.gnomad3.AgeInfo age_info = 16;
// Depth of coverage-related information.
optional annonars.gnomad.gnomad3.DepthInfo depth_info = 17;
// VRS infos.
optional VrsInfo vrs_info = 18;
}
3 changes: 2 additions & 1 deletion protos/annonars/gnomad/vep_gnomad3.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ package annonars.gnomad.vep_gnomad3;

import "annonars/gnomad/vep_common.proto";

// Protocol buffer for the gnomAD-mtDNA VEP predictions.
// Protocol buffer for the gnomAD-nuclear VEP predictions.
message Vep {
// Allele of record.
string allele = 1;
Expand Down Expand Up @@ -42,6 +42,7 @@ message Vep {
optional string amino_acids = 16;
// Codons, e.g., `"gCg/gGg"`.
optional string codons = 17;
// TODO: actually is optional int32 allele_num = 18;
// dbSNP ID, e.g., `"rs28942080"`.
optional string dbsnp_id = 18;
// Distance output of VEP.
Expand Down
103 changes: 103 additions & 0 deletions protos/annonars/gnomad/vep_gnomad4.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Protocol buffer definitions for gnomAD v4 VEP records.

syntax = "proto3";

package annonars.gnomad.vep_gnomad4;

import "annonars/gnomad/vep_common.proto";

// Protocol buffer for the gnomAD-nuclear VEP predictions.
message Vep {
// Allele of record.
string allele = 1;
// Consequence, e.g., `"missense_variant"`.
string consequence = 2;
// Impact, e.g., `"MODERATE"`.
string impact = 3;
// Gene symbol, e.g., `"PCSK9"`.
string symbol = 4;
// Gene ID, `e.g., "ENSG00000169174"`.
string gene = 5;
// Feature type, e.g., `"Transcript"`.
string feature_type = 6;
// Feature ID, e.g., `"ENST00000302118"`.
string feature = 7;
// Feature biotype, e.g., `"protein_coding"`.
string feature_biotype = 8;
// Ranked exon number, e.g., `"1/4"`.
optional string exon = 9;
// Ranked intron number, e.g., `"1/4"`.
optional string intron = 10;
// cDNA position, e.g., `"ENST00000302118.5:c.89C>G"`.
optional string hgvsc = 11;
// Protein position, e.g., `"ENSP00000302118.5:p.Thr30Arg"`.
optional string hgvsp = 12;
// cDNA position, e.g., `"89/1863"`.
optional string cdna_position = 13;
// CDS position, e.g., `"89/1863"`.
optional string cds_position = 14;
// Protein position, e.g., `"30/620"`.
optional string protein_position = 15;
// Amino acids, e.g., `"T/R"`.
optional string amino_acids = 16;
// Codons, e.g., `"gCg/gGg"`.
optional string codons = 17;
// Allele count.
optional int32 allele_num = 18;
// Distance output of VEP.
optional string distance = 19;
// Strand, e.g., `"1"`.
optional string strand = 20;
// Flags
optional string flags = 21;
// Variant class, e.g., `"SNV"`.
optional string variant_class = 22;
// Symbol source, e.g., `"HGNC"`.
optional string symbol_source = 23;
// HGNC ID, e.g., `"HGNC:8706"`.
optional string hgnc_id = 24;
// Whether this is the canonical transcript.
optional bool canonical = 25;
// Presence in MANE Select
optional bool mane_select = 26;
// Presence in MANE Plus Clinical
optional bool mane_plus_clinical = 27;
// Transcript support level, e.g., `"1"`.
optional int32 tsl = 28;
// APPRIS annotation, e.g. `"P1"`.
optional string appris = 29;
// CCDS ID, e.g., `"CCDS30547.1"`.
optional string ccds = 30;
// Ensembl protein ID, e.g., `"ENSP00000302118"`.
optional string ensp = 31;
// Uniprot isoform.
optional string uniprot_isoform = 32;
// Value of VEP "SOURCE" field.
optional string source = 33;
// Protein domains, e.g., `[["2p4e", "ENSP_mappings"], ["2qtw", "ENSP_mappings"]]`.
repeated annonars.gnomad.vep_common.Domain domains = 34;
// miRNA information.
optional string mirna = 35;
// HGVS offset.
optional string hgvs_offset = 36;
// PubMed IDs
optional string pubmed = 37;
// Motif name.
optional string motif_name = 38;
// Motif name.
optional string motif_pos = 39;
// "high inf pos" from VEP.
optional string high_inf_pos = 40;
// Motif score change.
optional string motif_score_change = 41;
// Transcription factors.
optional string transcription_factors = 42;
// Loss of function prediction.
optional string lof = 43;
// Loss of function filter.
optional string lof_filter = 44;
// Loss of function flags.
optional string lof_flags = 45;
// Loss of function info.
optional string lof_info = 46;
}
13 changes: 13 additions & 0 deletions src/common/noodles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ pub fn get_vec_str(record: &noodles_vcf::Record, name: &str) -> Result<Vec<Strin
}
}

/// Extract an `Vec<i32>` field from record with an array field.
///
/// This is different than parsing the histograms from pipe-separated strings.
pub fn get_vec_i32(record: &noodles_vcf::Record, name: &str) -> Result<Vec<i32>, anyhow::Error> {
if let Some(Some(field::Value::Array(field::value::Array::Integer(vs)))) =
record.info().get(&field::Key::from_str(name)?)
{
Ok(vs.iter().flatten().cloned().collect())
} else {
anyhow::bail!("missing INFO/{} in gnomAD record", name)
}
}

/// Extract an `Vec<FromStr>` field from a record encoded as a pipe symbol separated string.
pub fn get_vec<T>(record: &noodles_vcf::Record, name: &str) -> Result<Vec<T>, anyhow::Error>
where
Expand Down
Loading

0 comments on commit 2a7e098

Please sign in to comment.