Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: gnomAD v4 frequency import (#275) #368

Merged
merged 15 commits into from
Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ tracing = "0.1"
tracing-subscriber = "0.3"
rustc-hash = "1.1.0"
noodles-gff = "0.25.0"
erased-serde = "0.4.1"

[build-dependencies]
prost-build = "0.12"
Expand All @@ -71,3 +72,9 @@ temp_testdir = "0.2"
test-log = "0.2"
tracing-subscriber = "0.3"
tracing-test = "0.2.4"

# Compile insta with full optimization.
[profile.dev.package.insta]
opt-level = 3
[profile.dev.package.similar]
opt-level = 3
2 changes: 2 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ fn main() -> Result<(), anyhow::Error> {
"annonars/gnomad/exac_cnv.proto",
"annonars/gnomad/gnomad2.proto",
"annonars/gnomad/gnomad3.proto",
"annonars/gnomad/gnomad4.proto",
"annonars/gnomad/gnomad_cnv4.proto",
"annonars/gnomad/gnomad_sv2.proto",
"annonars/gnomad/gnomad_sv4.proto",
"annonars/gnomad/mtdna.proto",
"annonars/gnomad/vep_common.proto",
"annonars/gnomad/vep_gnomad2.proto",
"annonars/gnomad/vep_gnomad3.proto",
"annonars/gnomad/vep_gnomad4.proto",
"annonars/helixmtdb/base.proto",
"annonars/regions/clingen.proto",
]
Expand Down
17 changes: 15 additions & 2 deletions protos/annonars/gnomad/gnomad3.proto
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// Protobufs for gnomAD v3.
//
// In the case of smaller additions in **both exomes and genomes** in v4.0,
// we extended the protobufs here and re-used them.

syntax = "proto3";

package annonars.gnomad.gnomad3;
Expand All @@ -11,7 +16,11 @@ enum Filter {
// Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB <
// 0.2 for het calls).
FILTER_ALLELE_COUNT_IS_ZERO = 1;
// Failed VQSR filtering thresholds of -2.7739 for SNPs and -1.0606 for indels.
// Failed VQSR filtering thresholds of:
//
// gnomAD-genomes v3: -2.7739 for SNPs and -1.0606 for indels
// gnomAD-genomes v4: -2.502 for SNPs and -0.7156 for indels
// gnomAD-exomes v4: -1.4526 for SNPs and 0.0717 for indels
FILTER_AS_VSQR = 2;
// InbreedingCoeff < -0.3.
FILTER_INBREEDING_COEFF = 3;
Expand Down Expand Up @@ -185,6 +194,8 @@ message QualityInfo {
optional int32 as_qual_approx = 18;
// Allele-specific forward/reverse read counts for strand bias tests.
optional string as_sb_table = 19;
// Strand bias estimated by the symmetric odds ratio test (v4 only).
optional float sor = 20;
}


Expand All @@ -202,9 +213,11 @@ message VariantInfo {
bool monoallelic = 5;
// Depth over variant genotypes (does not include depth of reference samples).
int32 var_dp = 6;
// Allele-specific depth over variant genotypes (does not include depth of reference samples) (v4 only).
optional int32 as_vardp = 7;
}

// Protocol buffer for the gnomAD-mtDNA VCF record.
// Protocol buffer for the gnomAD-nuclear VCF record.
//
// The more specialized fields from the INFO column are stored in separate, optional fields such
// that we don't end up with a humongous message.
Expand Down
130 changes: 130 additions & 0 deletions protos/annonars/gnomad/gnomad4.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
syntax = "proto3";

package annonars.gnomad.gnomad4;

import "annonars/gnomad/gnomad3.proto";
import "annonars/gnomad/vep_gnomad4.proto";

// Store details on variant effect predictions.
message EffectInfo {
// Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering">
optional float pangolin_largest_ds = 1;
// Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).">
optional float phylop = 2;
// Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
optional float polyphen_max = 3;
// The maximum REVEL score at a site's MANE Select or canonical transcript. It's an ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors). Scores ranges from 0 to 1. Variants with higher scores are predicted to be more likely to be deleterious.">
optional float revel_max = 4;
// Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.">
optional float sift_max = 5;
// Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering.">
optional float spliceai_ds_max = 6;
// Raw CADD scores are interpretable as the extent to which the annotation profile for a given variant suggests that the variant is likely to be 'observed' (negative values) vs 'simulated' (positive values). Larger values are more deleterious.
optional float cadd_raw = 7;
// Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values are more deleterious.
optional float cadd_phred = 8;
}

// Store the allele counts for the given sub cohort in the given ancestry group.
message AncestryGroupAlleleCounts {
// Name of the ancestry group.
string ancestry_group = 1;
// The overall allele counts and the one by sex.
annonars.gnomad.gnomad3.AlleleCountsBySex counts = 2;

// The filtering allele frequency (using Poisson 95% CI).
optional float faf95 = 3;
// The filtering allele frequency (using Poisson 99% CI).
optional float faf99 = 4;
// The filtering allele frequency for XX samples (using Poisson 95% CI).
optional float faf95_xx = 5;
// The filtering allele frequency for XX samples (using Poisson 99% CI).
optional float faf99_xx = 6;
// The filtering allele frequency for XY samples (using Poisson 95% CI).
optional float faf95_xy = 7;
// The filtering allele frequency for XY samples (using Poisson 99% CI).
optional float faf99_xy = 8;
}

// Store the allele counts for the given cohort.
message CohortAlleleCounts {
// Name of the cohort.
optional string cohort = 1;
// Allele counts for each population.
repeated AncestryGroupAlleleCounts by_ancestry_group = 2;
// Allele counts by sex.
annonars.gnomad.gnomad3.AlleleCountsBySex by_sex = 3;
// Raw allele counts.
annonars.gnomad.gnomad3.AlleleCounts raw = 4;

// The ancestry group with maximum AF.
optional string grpmax = 5;
// Maximum allele frequency across ancestry groups.
optional float af_grpmax = 6;
// Allele count in ancestry group with maximum AF.
optional int32 ac_grpmax = 7;
// Total number of alleles in ancestry group with maximum AF.
optional int32 an_grpmax = 8;
// Total number of homozygous individuals in ancestry group with maximum AF.
optional int32 nhomalt_grpmax = 9;
}

// VRS information
message VrsInfo {
// The computed identifiers for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated string allele_ids = 1;
// Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated int32 ends = 2;
// Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated int32 starts = 3;
// The literal sequence states used for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields
repeated string states = 4;
}

// Protocol buffer for the gnomAD-nuclear VCF record.
//
// The more specialized fields from the INFO column are stored in separate, optional fields such
// that we don't end up with a humongous message.
message Record {
// Chromosome name.
string chrom = 1;
// 1-based start position.
int32 pos = 2;
// Reference allele.
string ref_allele = 3;
// Alternate allele.
string alt_allele = 4;

// Site-level filters.
repeated annonars.gnomad.gnomad3.Filter filters = 5;
// VEP annotation records.
repeated annonars.gnomad.vep_gnomad4.Vep vep = 6;

// Variant allele counts in the different cohorts and population.
//
// The populations in gnomAD v4 are: empty for global, "joint" for exome+genomes.
repeated CohortAlleleCounts allele_counts = 7;
// Variant (on sex chromosome) falls outside a pseudoautosomal region
bool nonpar = 8;
// All samples are heterozygous for the variant
bool only_het = 9;
// Variant falls outside of Broad exome capture regions (exomes only).
bool outside_broad_capture_region = 10;
// Variant falls outside of UK Biobank exome capture regions(exomes only).
bool outside_ukb_capture_region = 11;
// Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort) (exomes only).
bool sibling_singleton = 12;

// Information on variant scores.
optional EffectInfo effect_info = 13;
// Variant-related information details.
optional annonars.gnomad.gnomad3.VariantInfo variant_info = 14;
// Summary information for variant quality interpretation.
optional annonars.gnomad.gnomad3.QualityInfo quality_info = 15;
// Age-related information.
optional annonars.gnomad.gnomad3.AgeInfo age_info = 16;
// Depth of coverage-related information.
optional annonars.gnomad.gnomad3.DepthInfo depth_info = 17;
// VRS infos.
optional VrsInfo vrs_info = 18;
}
3 changes: 2 additions & 1 deletion protos/annonars/gnomad/vep_gnomad3.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ package annonars.gnomad.vep_gnomad3;

import "annonars/gnomad/vep_common.proto";

// Protocol buffer for the gnomAD-mtDNA VEP predictions.
// Protocol buffer for the gnomAD-nuclear VEP predictions.
message Vep {
// Allele of record.
string allele = 1;
Expand Down Expand Up @@ -42,6 +42,7 @@ message Vep {
optional string amino_acids = 16;
// Codons, e.g., `"gCg/gGg"`.
optional string codons = 17;
// TODO: actually is optional int32 allele_num = 18;
// dbSNP ID, e.g., `"rs28942080"`.
optional string dbsnp_id = 18;
// Distance output of VEP.
Expand Down
103 changes: 103 additions & 0 deletions protos/annonars/gnomad/vep_gnomad4.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Protocol buffer definitions for gnomAD v4 VEP records.

syntax = "proto3";

package annonars.gnomad.vep_gnomad4;

import "annonars/gnomad/vep_common.proto";

// Protocol buffer for the gnomAD-nuclear VEP predictions.
message Vep {
// Allele of record.
string allele = 1;
// Consequence, e.g., `"missense_variant"`.
string consequence = 2;
// Impact, e.g., `"MODERATE"`.
string impact = 3;
// Gene symbol, e.g., `"PCSK9"`.
string symbol = 4;
// Gene ID, `e.g., "ENSG00000169174"`.
string gene = 5;
// Feature type, e.g., `"Transcript"`.
string feature_type = 6;
// Feature ID, e.g., `"ENST00000302118"`.
string feature = 7;
// Feature biotype, e.g., `"protein_coding"`.
string feature_biotype = 8;
// Ranked exon number, e.g., `"1/4"`.
optional string exon = 9;
// Ranked intron number, e.g., `"1/4"`.
optional string intron = 10;
// cDNA position, e.g., `"ENST00000302118.5:c.89C>G"`.
optional string hgvsc = 11;
// Protein position, e.g., `"ENSP00000302118.5:p.Thr30Arg"`.
optional string hgvsp = 12;
// cDNA position, e.g., `"89/1863"`.
optional string cdna_position = 13;
// CDS position, e.g., `"89/1863"`.
optional string cds_position = 14;
// Protein position, e.g., `"30/620"`.
optional string protein_position = 15;
// Amino acids, e.g., `"T/R"`.
optional string amino_acids = 16;
// Codons, e.g., `"gCg/gGg"`.
optional string codons = 17;
// Allele count.
optional int32 allele_num = 18;
// Distance output of VEP.
optional string distance = 19;
// Strand, e.g., `"1"`.
optional string strand = 20;
// Flags
optional string flags = 21;
// Variant class, e.g., `"SNV"`.
optional string variant_class = 22;
// Symbol source, e.g., `"HGNC"`.
optional string symbol_source = 23;
// HGNC ID, e.g., `"HGNC:8706"`.
optional string hgnc_id = 24;
// Whether this is the canonical transcript.
optional bool canonical = 25;
// Presence in MANE Select
optional bool mane_select = 26;
// Presence in MANE Plus Clinical
optional bool mane_plus_clinical = 27;
// Transcript support level, e.g., `"1"`.
optional int32 tsl = 28;
// APPRIS annotation, e.g. `"P1"`.
optional string appris = 29;
// CCDS ID, e.g., `"CCDS30547.1"`.
optional string ccds = 30;
// Ensembl protein ID, e.g., `"ENSP00000302118"`.
optional string ensp = 31;
// Uniprot isoform.
optional string uniprot_isoform = 32;
// Value of VEP "SOURCE" field.
optional string source = 33;
// Protein domains, e.g., `[["2p4e", "ENSP_mappings"], ["2qtw", "ENSP_mappings"]]`.
repeated annonars.gnomad.vep_common.Domain domains = 34;
// miRNA information.
optional string mirna = 35;
// HGVS offset.
optional string hgvs_offset = 36;
// PubMed IDs
optional string pubmed = 37;
// Motif name.
optional string motif_name = 38;
// Motif name.
optional string motif_pos = 39;
// "high inf pos" from VEP.
optional string high_inf_pos = 40;
// Motif score change.
optional string motif_score_change = 41;
// Transcription factors.
optional string transcription_factors = 42;
// Loss of function prediction.
optional string lof = 43;
// Loss of function filter.
optional string lof_filter = 44;
// Loss of function flags.
optional string lof_flags = 45;
// Loss of function info.
optional string lof_info = 46;
}
13 changes: 13 additions & 0 deletions src/common/noodles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ pub fn get_vec_str(record: &noodles_vcf::Record, name: &str) -> Result<Vec<Strin
}
}

/// Extract an `Vec<i32>` field from record with an array field.
///
/// This is different than parsing the histograms from pipe-separated strings.
pub fn get_vec_i32(record: &noodles_vcf::Record, name: &str) -> Result<Vec<i32>, anyhow::Error> {
if let Some(Some(field::Value::Array(field::value::Array::Integer(vs)))) =
record.info().get(&field::Key::from_str(name)?)
{
Ok(vs.iter().flatten().cloned().collect())
} else {
anyhow::bail!("missing INFO/{} in gnomAD record", name)
}
}

/// Extract an `Vec<FromStr>` field from a record encoded as a pipe symbol separated string.
pub fn get_vec<T>(record: &noodles_vcf::Record, name: &str) -> Result<Vec<T>, anyhow::Error>
where
Expand Down
Loading
Loading