-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
203 changed files
with
17,202 additions
and
221 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
syntax = "proto3"; | ||
|
||
package annonars.gnomad.gnomad4; | ||
|
||
import "annonars/gnomad/gnomad3.proto"; | ||
import "annonars/gnomad/vep_gnomad4.proto"; | ||
|
||
// Store details on variant effect predictions. | ||
message EffectInfo { | ||
// Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering"> | ||
optional float pangolin_largest_ds = 1; | ||
// Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores)."> | ||
optional float phylop = 2; | ||
// Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript."> | ||
optional float polyphen_max = 3; | ||
// The maximum REVEL score at a site's MANE Select or canonical transcript. It's an ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors). Scores ranges from 0 to 1. Variants with higher scores are predicted to be more likely to be deleterious."> | ||
optional float revel_max = 4; | ||
// Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript."> | ||
optional float sift_max = 5; | ||
// Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering."> | ||
optional float spliceai_ds_max = 6; | ||
// Raw CADD scores are interpretable as the extent to which the annotation profile for a given variant suggests that the variant is likely to be 'observed' (negative values) vs 'simulated' (positive values). Larger values are more deleterious. | ||
optional float cadd_raw = 7; | ||
// Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values are more deleterious. | ||
optional float cadd_phred = 8; | ||
} | ||
|
||
// Store the allele counts for the given sub cohort in the given ancestry group. | ||
message AncestryGroupAlleleCounts { | ||
// Name of the ancestry group. | ||
string ancestry_group = 1; | ||
// The overall allele counts and the one by sex. | ||
annonars.gnomad.gnomad3.AlleleCountsBySex counts = 2; | ||
|
||
// The filtering allele frequency (using Poisson 95% CI). | ||
optional float faf95 = 3; | ||
// The filtering allele frequency (using Poisson 99% CI). | ||
optional float faf99 = 4; | ||
// The filtering allele frequency for XX samples (using Poisson 95% CI). | ||
optional float faf95_xx = 5; | ||
// The filtering allele frequency for XX samples (using Poisson 99% CI). | ||
optional float faf99_xx = 6; | ||
// The filtering allele frequency for XY samples (using Poisson 95% CI). | ||
optional float faf95_xy = 7; | ||
// The filtering allele frequency for XY samples (using Poisson 99% CI). | ||
optional float faf99_xy = 8; | ||
} | ||
|
||
// Store the allele counts for the given cohort. | ||
message CohortAlleleCounts { | ||
// Name of the cohort. | ||
optional string cohort = 1; | ||
// Allele counts for each population. | ||
repeated AncestryGroupAlleleCounts by_ancestry_group = 2; | ||
// Allele counts by sex. | ||
annonars.gnomad.gnomad3.AlleleCountsBySex by_sex = 3; | ||
// Raw allele counts. | ||
annonars.gnomad.gnomad3.AlleleCounts raw = 4; | ||
|
||
// The ancestry group with maximum AF. | ||
optional string grpmax = 5; | ||
// Maximum allele frequency across ancestry groups. | ||
optional float af_grpmax = 6; | ||
// Allele count in ancestry group with maximum AF. | ||
optional int32 ac_grpmax = 7; | ||
// Total number of alleles in ancestry group with maximum AF. | ||
optional int32 an_grpmax = 8; | ||
// Total number of homozygous individuals in ancestry group with maximum AF. | ||
optional int32 nhomalt_grpmax = 9; | ||
} | ||
|
||
// VRS information | ||
message VrsInfo { | ||
// The computed identifiers for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields | ||
repeated string allele_ids = 1; | ||
// Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields | ||
repeated int32 ends = 2; | ||
// Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields | ||
repeated int32 starts = 3; | ||
// The literal sequence states used for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields | ||
repeated string states = 4; | ||
} | ||
|
||
// Protocol buffer for the gnomAD-nuclear VCF record. | ||
// | ||
// The more specialized fields from the INFO column are stored in separate, optional fields such | ||
// that we don't end up with a humongous message. | ||
message Record { | ||
// Chromosome name. | ||
string chrom = 1; | ||
// 1-based start position. | ||
int32 pos = 2; | ||
// Reference allele. | ||
string ref_allele = 3; | ||
// Alternate allele. | ||
string alt_allele = 4; | ||
|
||
// Site-level filters. | ||
repeated annonars.gnomad.gnomad3.Filter filters = 5; | ||
// VEP annotation records. | ||
repeated annonars.gnomad.vep_gnomad4.Vep vep = 6; | ||
|
||
// Variant allele counts in the different cohorts and population. | ||
// | ||
// The populations in gnomAD v4 are: empty for global, "joint" for exome+genomes. | ||
repeated CohortAlleleCounts allele_counts = 7; | ||
// Variant (on sex chromosome) falls outside a pseudoautosomal region | ||
bool nonpar = 8; | ||
// All samples are heterozygous for the variant | ||
bool only_het = 9; | ||
// Variant falls outside of Broad exome capture regions (exomes only). | ||
bool outside_broad_capture_region = 10; | ||
// Variant falls outside of UK Biobank exome capture regions(exomes only). | ||
bool outside_ukb_capture_region = 11; | ||
// Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort) (exomes only). | ||
bool sibling_singleton = 12; | ||
|
||
// Information on variant scores. | ||
optional EffectInfo effect_info = 13; | ||
// Variant-related information details. | ||
optional annonars.gnomad.gnomad3.VariantInfo variant_info = 14; | ||
// Summary information for variant quality interpretation. | ||
optional annonars.gnomad.gnomad3.QualityInfo quality_info = 15; | ||
// Age-related information. | ||
optional annonars.gnomad.gnomad3.AgeInfo age_info = 16; | ||
// Depth of coverage-related information. | ||
optional annonars.gnomad.gnomad3.DepthInfo depth_info = 17; | ||
// VRS infos. | ||
optional VrsInfo vrs_info = 18; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
// Protocol buffer definitions for gnomAD v4 VEP records. | ||
|
||
syntax = "proto3"; | ||
|
||
package annonars.gnomad.vep_gnomad4; | ||
|
||
import "annonars/gnomad/vep_common.proto"; | ||
|
||
// Protocol buffer for the gnomAD-nuclear VEP predictions. | ||
message Vep { | ||
// Allele of record. | ||
string allele = 1; | ||
// Consequence, e.g., `"missense_variant"`. | ||
string consequence = 2; | ||
// Impact, e.g., `"MODERATE"`. | ||
string impact = 3; | ||
// Gene symbol, e.g., `"PCSK9"`. | ||
string symbol = 4; | ||
// Gene ID, `e.g., "ENSG00000169174"`. | ||
string gene = 5; | ||
// Feature type, e.g., `"Transcript"`. | ||
string feature_type = 6; | ||
// Feature ID, e.g., `"ENST00000302118"`. | ||
string feature = 7; | ||
// Feature biotype, e.g., `"protein_coding"`. | ||
string feature_biotype = 8; | ||
// Ranked exon number, e.g., `"1/4"`. | ||
optional string exon = 9; | ||
// Ranked intron number, e.g., `"1/4"`. | ||
optional string intron = 10; | ||
// cDNA position, e.g., `"ENST00000302118.5:c.89C>G"`. | ||
optional string hgvsc = 11; | ||
// Protein position, e.g., `"ENSP00000302118.5:p.Thr30Arg"`. | ||
optional string hgvsp = 12; | ||
// cDNA position, e.g., `"89/1863"`. | ||
optional string cdna_position = 13; | ||
// CDS position, e.g., `"89/1863"`. | ||
optional string cds_position = 14; | ||
// Protein position, e.g., `"30/620"`. | ||
optional string protein_position = 15; | ||
// Amino acids, e.g., `"T/R"`. | ||
optional string amino_acids = 16; | ||
// Codons, e.g., `"gCg/gGg"`. | ||
optional string codons = 17; | ||
// Allele count. | ||
optional int32 allele_num = 18; | ||
// Distance output of VEP. | ||
optional string distance = 19; | ||
// Strand, e.g., `"1"`. | ||
optional string strand = 20; | ||
// Flags | ||
optional string flags = 21; | ||
// Variant class, e.g., `"SNV"`. | ||
optional string variant_class = 22; | ||
// Symbol source, e.g., `"HGNC"`. | ||
optional string symbol_source = 23; | ||
// HGNC ID, e.g., `"HGNC:8706"`. | ||
optional string hgnc_id = 24; | ||
// Whether this is the canonical transcript. | ||
optional bool canonical = 25; | ||
// Presence in MANE Select | ||
optional bool mane_select = 26; | ||
// Presence in MANE Plus Clinical | ||
optional bool mane_plus_clinical = 27; | ||
// Transcript support level, e.g., `"1"`. | ||
optional int32 tsl = 28; | ||
// APPRIS annotation, e.g. `"P1"`. | ||
optional string appris = 29; | ||
// CCDS ID, e.g., `"CCDS30547.1"`. | ||
optional string ccds = 30; | ||
// Ensembl protein ID, e.g., `"ENSP00000302118"`. | ||
optional string ensp = 31; | ||
// Uniprot isoform. | ||
optional string uniprot_isoform = 32; | ||
// Value of VEP "SOURCE" field. | ||
optional string source = 33; | ||
// Protein domains, e.g., `[["2p4e", "ENSP_mappings"], ["2qtw", "ENSP_mappings"]]`. | ||
repeated annonars.gnomad.vep_common.Domain domains = 34; | ||
// miRNA information. | ||
optional string mirna = 35; | ||
// HGVS offset. | ||
optional string hgvs_offset = 36; | ||
// PubMed IDs | ||
optional string pubmed = 37; | ||
// Motif name. | ||
optional string motif_name = 38; | ||
// Motif name. | ||
optional string motif_pos = 39; | ||
// "high inf pos" from VEP. | ||
optional string high_inf_pos = 40; | ||
// Motif score change. | ||
optional string motif_score_change = 41; | ||
// Transcription factors. | ||
optional string transcription_factors = 42; | ||
// Loss of function prediction. | ||
optional string lof = 43; | ||
// Loss of function filter. | ||
optional string lof_filter = 44; | ||
// Loss of function flags. | ||
optional string lof_flags = 45; | ||
// Loss of function info. | ||
optional string lof_info = 46; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.