feat: gnomAD v4 frequency import (#275) (#368)

varfish-org · Dec 28, 2023 · 2a7e098 · 2a7e098
1 parent 29f71d1
commit 2a7e098
Show file tree

Hide file tree

Showing 203 changed files with 17,202 additions and 221 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -56,6 +56,7 @@ tracing = "0.1"
 tracing-subscriber = "0.3"
 rustc-hash = "1.1.0"
 noodles-gff = "0.25.0"
+erased-serde = "0.4.1"
 
 [build-dependencies]
 prost-build = "0.12"
@@ -71,3 +72,9 @@ temp_testdir = "0.2"
 test-log = "0.2"
 tracing-subscriber = "0.3"
 tracing-test = "0.2.4"
+
+# Compile insta with full optimization.
+[profile.dev.package.insta]
+opt-level = 3
+[profile.dev.package.similar]
+opt-level = 3
diff --git a/build.rs b/build.rs
@@ -16,13 +16,15 @@ fn main() -> Result<(), anyhow::Error> {
         "annonars/gnomad/exac_cnv.proto",
         "annonars/gnomad/gnomad2.proto",
         "annonars/gnomad/gnomad3.proto",
+        "annonars/gnomad/gnomad4.proto",
         "annonars/gnomad/gnomad_cnv4.proto",
         "annonars/gnomad/gnomad_sv2.proto",
         "annonars/gnomad/gnomad_sv4.proto",
         "annonars/gnomad/mtdna.proto",
         "annonars/gnomad/vep_common.proto",
         "annonars/gnomad/vep_gnomad2.proto",
         "annonars/gnomad/vep_gnomad3.proto",
+        "annonars/gnomad/vep_gnomad4.proto",
         "annonars/helixmtdb/base.proto",
         "annonars/regions/clingen.proto",
     ]

diff --git a/protos/annonars/gnomad/gnomad3.proto b/protos/annonars/gnomad/gnomad3.proto
@@ -1,3 +1,8 @@
+// Protobufs for gnomAD v3.
+//
+// In the case of smaller additions in **both exomes and genomes** in v4.0,
+// we extended the protobufs here and re-used them.
+
 syntax = "proto3";
 
 package annonars.gnomad.gnomad3;
@@ -11,7 +16,11 @@ enum Filter {
     // Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB <
     // 0.2 for het calls).
     FILTER_ALLELE_COUNT_IS_ZERO = 1;
-    // Failed VQSR filtering thresholds of -2.7739 for SNPs and -1.0606 for indels.
+    // Failed VQSR filtering thresholds of:
+    //
+    // gnomAD-genomes v3: -2.7739 for SNPs and -1.0606 for indels
+    // gnomAD-genomes v4: -2.502 for SNPs and -0.7156 for indels
+    // gnomAD-exomes v4: -1.4526 for SNPs and 0.0717 for indels
     FILTER_AS_VSQR = 2;
     // InbreedingCoeff < -0.3.
     FILTER_INBREEDING_COEFF = 3;
@@ -185,6 +194,8 @@ message QualityInfo {
     optional int32 as_qual_approx = 18;
     // Allele-specific forward/reverse read counts for strand bias tests.
     optional string as_sb_table = 19;
+    // Strand bias estimated by the symmetric odds ratio test (v4 only).
+    optional float sor = 20;
 }
 
 
@@ -202,9 +213,11 @@ message VariantInfo {
     bool monoallelic = 5;
     // Depth over variant genotypes (does not include depth of reference samples).
     int32 var_dp = 6;
+    // Allele-specific depth over variant genotypes (does not include depth of reference samples) (v4 only).
+    optional int32 as_vardp = 7;
 }
 
-// Protocol buffer for the gnomAD-mtDNA VCF record.
+// Protocol buffer for the gnomAD-nuclear VCF record.
 //
 // The more specialized fields from the INFO column are stored in separate, optional fields such
 // that we don't end up with a humongous message.

diff --git a/protos/annonars/gnomad/gnomad4.proto b/protos/annonars/gnomad/gnomad4.proto
@@ -0,0 +1,130 @@
+syntax = "proto3";
+
+package annonars.gnomad.gnomad4;
+
+import "annonars/gnomad/gnomad3.proto";
+import "annonars/gnomad/vep_gnomad4.proto";
+
+// Store details on variant effect predictions.
+message EffectInfo {
+    // Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering">
+    optional float pangolin_largest_ds = 1;
+    // Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).">
+    optional float phylop = 2;
+    // Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious).  We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
+    optional float polyphen_max = 3;
+    // The maximum REVEL score at a site's MANE Select or canonical transcript. It's an ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors). Scores ranges from 0 to 1. Variants with higher scores are predicted to be more likely to be deleterious.">
+    optional float revel_max = 4;
+    // Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
+    optional float sift_max = 5;
+    // Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering.">
+    optional float spliceai_ds_max = 6;
+    // Raw CADD scores are interpretable as the extent to which the annotation profile for a given variant suggests that the variant is likely to be 'observed' (negative values) vs 'simulated' (positive values). Larger values are more deleterious.
+    optional float cadd_raw = 7;
+    // Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values are more deleterious.
+    optional float cadd_phred = 8;
+}
+
+// Store the allele counts for the given sub cohort in the given ancestry group.
+message AncestryGroupAlleleCounts {
+    // Name of the ancestry group.
+    string ancestry_group = 1;
+    // The overall allele counts and the one by sex.
+    annonars.gnomad.gnomad3.AlleleCountsBySex counts = 2;
+
+    // The filtering allele frequency (using Poisson 95% CI).
+    optional float faf95 = 3;
+    // The filtering allele frequency (using Poisson 99% CI).
+    optional float faf99 = 4;
+    // The filtering allele frequency for XX samples (using Poisson 95% CI).
+    optional float faf95_xx = 5;
+    // The filtering allele frequency for XX samples (using Poisson 99% CI).
+    optional float faf99_xx = 6;
+    // The filtering allele frequency for XY samples (using Poisson 95% CI).
+    optional float faf95_xy = 7;
+    // The filtering allele frequency for XY samples (using Poisson 99% CI).
+    optional float faf99_xy = 8;
+}
+
+// Store the allele counts for the given cohort.
+message CohortAlleleCounts {
+    // Name of the cohort.
+    optional string cohort = 1;
+    // Allele counts for each population.
+    repeated AncestryGroupAlleleCounts by_ancestry_group = 2;
+    // Allele counts by sex.
+    annonars.gnomad.gnomad3.AlleleCountsBySex by_sex = 3;
+    // Raw allele counts.
+    annonars.gnomad.gnomad3.AlleleCounts raw = 4;
+
+    // The ancestry group with maximum AF.
+    optional string grpmax = 5;
+    // Maximum allele frequency across ancestry groups.
+    optional float af_grpmax = 6;
+    // Allele count in ancestry group with maximum AF.
+    optional int32 ac_grpmax = 7;
+    // Total number of alleles in ancestry group with maximum AF.
+    optional int32 an_grpmax = 8;
+    // Total number of homozygous individuals in ancestry group with maximum AF.
+    optional int32 nhomalt_grpmax = 9;
+}
+
+// VRS information
+message VrsInfo {
+    // The computed identifiers for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
+    repeated string allele_ids = 1;
+    // Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
+    repeated int32 ends = 2;
+    // Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
+    repeated int32 starts = 3;
+    // The literal sequence states used for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
+    repeated string states = 4;
+}
+
+// Protocol buffer for the gnomAD-nuclear VCF record.
+//
+// The more specialized fields from the INFO column are stored in separate, optional fields such
+// that we don't end up with a humongous message.
+message Record {
+    // Chromosome name.
+    string chrom = 1;
+    // 1-based start position.
+    int32 pos = 2;
+    // Reference allele.
+    string ref_allele = 3;
+    // Alternate allele.
+    string alt_allele = 4;
+
+    // Site-level filters.
+    repeated annonars.gnomad.gnomad3.Filter filters = 5;
+    // VEP annotation records.
+    repeated annonars.gnomad.vep_gnomad4.Vep vep = 6;
+
+    // Variant allele counts in the different cohorts and population.
+    //
+    // The populations in gnomAD v4 are: empty for global, "joint" for exome+genomes.
+    repeated CohortAlleleCounts allele_counts = 7;
+    // Variant (on sex chromosome) falls outside a pseudoautosomal region
+    bool nonpar = 8;
+    // All samples are heterozygous for the variant
+    bool only_het = 9;
+    // Variant falls outside of Broad exome capture regions (exomes only).
+    bool outside_broad_capture_region = 10;
+    // Variant falls outside of UK Biobank exome capture regions(exomes only).
+    bool outside_ukb_capture_region = 11;
+    // Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort) (exomes only).
+    bool sibling_singleton = 12;
+
+    // Information on variant scores.
+    optional EffectInfo effect_info = 13;
+    // Variant-related information details.
+    optional annonars.gnomad.gnomad3.VariantInfo variant_info = 14;
+    // Summary information for variant quality interpretation.
+    optional annonars.gnomad.gnomad3.QualityInfo quality_info = 15;
+    // Age-related information.
+    optional annonars.gnomad.gnomad3.AgeInfo age_info = 16;
+    // Depth of coverage-related information.
+    optional annonars.gnomad.gnomad3.DepthInfo depth_info = 17;
+    // VRS infos.
+    optional VrsInfo vrs_info = 18;
+}
diff --git a/protos/annonars/gnomad/vep_gnomad3.proto b/protos/annonars/gnomad/vep_gnomad3.proto
@@ -6,7 +6,7 @@ package annonars.gnomad.vep_gnomad3;
 
 import "annonars/gnomad/vep_common.proto";
 
-// Protocol buffer for the gnomAD-mtDNA VEP predictions.
+// Protocol buffer for the gnomAD-nuclear VEP predictions.
 message Vep {
     // Allele of record.
     string allele = 1;
@@ -42,6 +42,7 @@ message Vep {
     optional string amino_acids = 16;
     // Codons, e.g., `"gCg/gGg"`.
     optional string codons = 17;
+    // TODO: actually is optional int32 allele_num = 18;
     // dbSNP ID, e.g., `"rs28942080"`.
     optional string dbsnp_id = 18;
     // Distance output of VEP.

diff --git a/protos/annonars/gnomad/vep_gnomad4.proto b/protos/annonars/gnomad/vep_gnomad4.proto
@@ -0,0 +1,103 @@
+// Protocol buffer definitions for gnomAD v4 VEP records.
+
+syntax = "proto3";
+
+package annonars.gnomad.vep_gnomad4;
+
+import "annonars/gnomad/vep_common.proto";
+
+// Protocol buffer for the gnomAD-nuclear VEP predictions.
+message Vep {
+    // Allele of record.
+    string allele = 1;
+    // Consequence, e.g., `"missense_variant"`.
+    string consequence = 2;
+    // Impact, e.g., `"MODERATE"`.
+    string impact = 3;
+    // Gene symbol, e.g., `"PCSK9"`.
+    string symbol = 4;
+    // Gene ID, `e.g., "ENSG00000169174"`.
+    string gene = 5;
+    // Feature type, e.g., `"Transcript"`.
+    string feature_type = 6;
+    // Feature ID, e.g., `"ENST00000302118"`.
+    string feature = 7;
+    // Feature biotype, e.g., `"protein_coding"`.
+    string feature_biotype = 8;
+    // Ranked exon number, e.g., `"1/4"`.
+    optional string exon = 9;
+    // Ranked intron number, e.g., `"1/4"`.
+    optional string intron = 10;
+    // cDNA position, e.g., `"ENST00000302118.5:c.89C>G"`.
+    optional string hgvsc = 11;
+    // Protein position, e.g., `"ENSP00000302118.5:p.Thr30Arg"`.
+    optional string hgvsp = 12;
+    // cDNA position, e.g., `"89/1863"`.
+    optional string cdna_position = 13;
+    // CDS position, e.g., `"89/1863"`.
+    optional string cds_position = 14;
+    // Protein position, e.g., `"30/620"`.
+    optional string protein_position = 15;
+    // Amino acids, e.g., `"T/R"`.
+    optional string amino_acids = 16;
+    // Codons, e.g., `"gCg/gGg"`.
+    optional string codons = 17;
+    // Allele count.
+    optional int32 allele_num = 18;
+    // Distance output of VEP.
+    optional string distance = 19;
+    // Strand, e.g., `"1"`.
+    optional string strand = 20;
+    // Flags
+    optional string flags = 21;
+    // Variant class, e.g., `"SNV"`.
+    optional string variant_class = 22;
+    // Symbol source, e.g., `"HGNC"`.
+    optional string symbol_source = 23;
+    // HGNC ID, e.g., `"HGNC:8706"`.
+    optional string hgnc_id = 24;
+    // Whether this is the canonical transcript.
+    optional bool canonical = 25;
+    // Presence in MANE Select
+    optional bool mane_select = 26;
+    // Presence in MANE Plus Clinical
+    optional bool mane_plus_clinical = 27;
+    // Transcript support level, e.g., `"1"`.
+    optional int32 tsl = 28;
+    // APPRIS annotation, e.g. `"P1"`.
+    optional string appris = 29;
+    // CCDS ID, e.g., `"CCDS30547.1"`.
+    optional string ccds = 30;
+    // Ensembl protein ID, e.g., `"ENSP00000302118"`.
+    optional string ensp = 31;
+    // Uniprot isoform.
+    optional string uniprot_isoform = 32;
+    // Value of VEP "SOURCE" field.
+    optional string source = 33;
+    // Protein domains, e.g., `[["2p4e", "ENSP_mappings"], ["2qtw", "ENSP_mappings"]]`.
+    repeated annonars.gnomad.vep_common.Domain domains = 34;
+    // miRNA information.
+    optional string mirna = 35;
+    // HGVS offset.
+    optional string hgvs_offset = 36;
+    // PubMed IDs
+    optional string pubmed = 37;
+    // Motif name.
+    optional string motif_name = 38;
+    // Motif name.
+    optional string motif_pos = 39;
+    // "high inf pos" from VEP.
+    optional string high_inf_pos = 40;
+    // Motif score change.
+    optional string motif_score_change = 41;
+    // Transcription factors.
+    optional string transcription_factors = 42;
+    // Loss of function prediction.
+    optional string lof = 43;
+    // Loss of function filter.
+    optional string lof_filter = 44;
+    // Loss of function flags.
+    optional string lof_flags = 45;
+    // Loss of function info.
+    optional string lof_info = 46;
+}
diff --git a/src/common/noodles.rs b/src/common/noodles.rs
@@ -64,6 +64,19 @@ pub fn get_vec_str(record: &noodles_vcf::Record, name: &str) -> Result<Vec<Strin
     }
 }
 
+/// Extract an `Vec<i32>` field from record with an array field.
+///
+/// This is different than parsing the histograms from pipe-separated strings.
+pub fn get_vec_i32(record: &noodles_vcf::Record, name: &str) -> Result<Vec<i32>, anyhow::Error> {
+    if let Some(Some(field::Value::Array(field::value::Array::Integer(vs)))) =
+        record.info().get(&field::Key::from_str(name)?)
+    {
+        Ok(vs.iter().flatten().cloned().collect())
+    } else {
+        anyhow::bail!("missing INFO/{} in gnomAD record", name)
+    }
+}
+
 /// Extract an `Vec<FromStr>` field from a record encoded as a pipe symbol separated string.
 pub fn get_vec<T>(record: &noodles_vcf::Record, name: &str) -> Result<Vec<T>, anyhow::Error>
 where