Add more annotations to IMPACT (#267)

* Add more annotations to IMPACT
BimberLab · Jul 11, 2023 · 0d057b5 · 0d057b5
1 parent 874ceab
commit 0d057b5
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 6 deletions.
diff --git a/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java b/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java
@@ -406,7 +406,7 @@ public class MultiSourceAnnotator extends VariantWalker {
 
     private final List<String> allAnnotationKeys = new ArrayList<>();
 
-    private final Collection<String> ALLOWABLE_FILTERS = Arrays.asList("ReverseComplementedIndel", "NoTarget", "MismatchedRefAllele", "IndelStraddlesMultipleIntevals");
+    private final Collection<String> ALLOWABLE_FILTERS = Arrays.asList("ReverseComplementedIndel", "NoTarget", "MismatchedRefAllele", "IndelStraddlesMultipleIntervals");
 
     @Override
     public void onTraversalStart() {

diff --git a/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java b/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java
@@ -25,6 +25,8 @@ public class Impact implements InfoFieldAnnotation, StandardAnnotation {
 
     public static final String IMPACT_KEY = "IMPACT";
     public static final String IMPACT_GENES_KEY = "HIG";
+    public static final String OVERLAPPING_GENES_KEY = "OG";
+    public static final String EFFECT_KEY = "VE";
 
     @Override
     public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods) {
@@ -50,6 +52,8 @@ public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, All
 
         Map<Allele, String> impactMap = new HashMap<>();
         Map<Allele, String> impactGeneMap = new HashMap<>();
+        Map<Allele, String> allGeneMap = new HashMap<>();
+        Map<Allele, String> effectMap = new HashMap<>();
 
         for (Allele a : vc.getAlternateAlleles()) {
             if (!annByAllele.containsKey(a.getBaseString())) {
@@ -58,14 +62,23 @@ public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, All
 
             Set<String> impacts = new HashSet<>();
             Set<String> hig = new HashSet<>();
+            Set<String> og = new HashSet<>();
+            Set<String> effects = new HashSet<>();
             for (String[] split : annByAllele.get(a.getBaseString())) {
                 if (split.length > 7 && "protein_coding".equals(split[7])) {
                     impacts.add(split[2]);
 
-                    if ("HIGH".equals(split[2])) {
-                        hig.add(split[3]);
+                    if (!split[3].isEmpty()) {
+                        og.add(split[3]);
+                        if ("HIGH".equals(split[2])) {
+                            hig.add(split[3]);
+                        }
                     }
                 }
+
+                if (!split[1].isEmpty()) {
+                    effects.add(split[1]);
+                }
             }
 
             for (String val : Arrays.asList("HIGH", "LOW", "MODERATE")) {
@@ -78,28 +91,46 @@ public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, All
             if (!hig.isEmpty()) {
                 impactGeneMap.put(a, StringUtils.join(hig, "|"));
             }
+
+            if (!og.isEmpty()) {
+                allGeneMap.put(a, StringUtils.join(og, "|"));
+            }
+
+            if (!effects.isEmpty()) {
+                effectMap.put(a, StringUtils.join(effects, "|"));
+            }
         }
 
         if (!impactMap.isEmpty()) {
             attributeMap.put(IMPACT_KEY, vc.getAlternateAlleles().stream().map(a -> impactMap.getOrDefault(a, "")).toList());
         }
 
-        if (!impactMap.isEmpty()) {
+        if (!impactGeneMap.isEmpty()) {
             attributeMap.put(IMPACT_GENES_KEY, vc.getAlternateAlleles().stream().map(a -> impactGeneMap.getOrDefault(a, "")).toList());
         }
 
+        if (!allGeneMap.isEmpty()) {
+            attributeMap.put(OVERLAPPING_GENES_KEY, vc.getAlternateAlleles().stream().map(a -> allGeneMap.getOrDefault(a, "")).toList());
+        }
+
+        if (!effectMap.isEmpty()) {
+            attributeMap.put(EFFECT_KEY, vc.getAlternateAlleles().stream().map(a -> effectMap.getOrDefault(a, "")).toList());
+        }
+
         if (attributeMap.isEmpty()) {
             return null;
         }
 
         return attributeMap;
     }
 
-    public List<String> getKeyNames() { return Arrays.asList(IMPACT_KEY, IMPACT_GENES_KEY); }
+    public List<String> getKeyNames() { return Arrays.asList(IMPACT_KEY, IMPACT_GENES_KEY, OVERLAPPING_GENES_KEY, EFFECT_KEY); }
 
     public List<VCFCompoundHeaderLine> getDescriptions() { return Arrays.asList(
             new VCFInfoHeaderLine(IMPACT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "The highest impact annotation provided by SnpEff, limited to protein_coding features"),
-            new VCFInfoHeaderLine(IMPACT_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes with high-impact effects on protein coding, identified by SnpEff")
+            new VCFInfoHeaderLine(IMPACT_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes with high-impact effects on protein coding, identified by SnpEff"),
+            new VCFInfoHeaderLine(OVERLAPPING_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes, identified by SnpEff. This includes variants immediately upstream/downstream of the coding region."),
+            new VCFInfoHeaderLine(EFFECT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of predicted variant effects, generated by SnpEff")
         );
     }
 }
diff --git a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf
@@ -152,6 +152,7 @@
 ##INFO=<ID=NM,Number=.,Type=String,Description="nsdb Interpro_domain: domain or conserved site on which the variant locates. Domain annotations come from Interpro database. The number in the brackets following a specific domain is the count of times Interpro assigns the variant position to that domain, typically coming from different predicting databases. Multiple entries separated by ';'.">
 ##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
 ##INFO=<ID=OA,Number=R,Type=String,Description="This contains a list of the original alleles at this site.  It is primarily intended to store information prior to liftover.">
+##INFO=<ID=OG,Number=A,Type=String,Description="A comma-separated list of any overlapping genes, identified by SnpEff. This includes variants immediately upstream/downstream of the coding region.">
 ##INFO=<ID=OMIMC,Number=.,Type=String,Description="OMIM Comments.">
 ##INFO=<ID=OMIMD,Number=.,Type=String,Description="OMIM Disorders.">
 ##INFO=<ID=OMIMM,Number=.,Type=String,Description="OMIM Method code:A = in situ DNA-RNA or DNA-DNA annealing (`hybridization'); e.g., ribosomal RNAgenes to acrocentric chromosomes;kappa light chain genes to chromosome 2.AAS = deductions from the amino acid sequence of proteins; e.g., linkage of delta and beta hemoglobin loci from study of hemoglobin Lepore.(Includes deductions of hybrid protein  structure by monoclonal antibodies; e.g., close linkage of MN and SS from study of Lepore-like MNSs blood group antigen.)Also includes examples of hybrid genes as in one form of hypertrophiccardiomyopathy and in apolipoprotein (Detroit).C = chromosome mediated gene transfer (CMGT); e.g., cotransfer of galactokinaseand thymidine kinase.(In conjunction with this approach fluorescence-activated flow sorting can be used for transfer of specific chromosomes.)Ch = chromosomal change associated with particular phenotype and not proved to represent linkage (Fc), deletion (D), or virus effect (V);  e.g., loss of 13q14 band in some cases of retinoblastoma.(`Fragile sites,' observed in cultured cells with or withoutfolate-deficient medium or BrdU treatment, fall into this class of method; e.g., fragile site at Xq27.3 in one form of X-linked mental retardation.Fragile sites have been used as markers in family linkage studies; e.g., FS16q22 and haptoglobin.)D = deletion or dosage mapping (concurrence of chromosomal deletion and phenotypic evidence of hemizygosity), trisomy mapping (presence of three alleles in the case of a highlypolymorphic locus), or gene dosage effects (correlation of trisomic state of part or all of a chromosome with 50% more gene product).Includes 'loss of heterozygosity' (loss of alleles) in malignancies.Examples:  glutathione reductase to chromosome 8.Includes DNA dosage; e.g., fibrinogen loci to 4q2.Dosage mapping also includes coamplification in tumor cells.EM = exclusion mapping, i.e., narrowing the possible location of loci by exclusion of parts of the map 
by deletion mapping, extended to include negative lod scores from families with marker chromosomes and negative lod scores with other assigned loci; e.g., support for assignment of MNSs to 4q.F = linkage study in families; e.g., linkage of ABO blood group andnail-patella syndrome.(When a chromosomal heteromorphism or rearrangement is one trait, Fcis used; e.g., Duffy blood group locus on chromosome 1.When 1 or both of the linked loci are identified by a DNA polymorphism,Fd is used; e.g., Huntington disease on chromosome 4.  F = L inthe HGM workshops.)H = based on presumed homology; e.g., proposed assignment of TF to 3q.Includes Ohno's law of evolutionary conservatism of X chromosome in mammals.Mainly heuristic or confirmatory.HS = DNA/cDNA molecular hybridization in solution (`Cot analysis'); e.g., assignment of Hb beta to chromosome 11 in derivative hybrid cells.L = lyonization; e.g., OTC to X chromosome.  (L = family linkage study in the HGM workshops.)LD = linkage disequilibrium; e.g., beta and delta globin genes (HBB, HBD).M = Microcell mediated gene transfer (MMGT); e.g., a collagen gene (COL1A1) to chromosome l7.OT = ovarian teratoma (centromere mapping); e.g., PGM3 and centromere of chromosome 6.Pcm = PCR of microdissected chromosome segments (see REl).Psh = PCR of somatic cell hybrid DNA.R = irradiation of cells followed by `rescue' through fusion with nonirradiated (nonhuman) cells (Goss-Harris method of radiation-induced gene segregation); e.g., order of genes on  Xq.(Also called cotransference. 
The complement of cotransference = recombination.)RE = Restriction endonuclease techniques; e.g., fine structure map of the beta-globin cluster (HBBC) on 11p; physical linkage of 3 fibrinogen genes (on 4q) and APOA1 and APOC3 (on 11p).REa = combined with somatic cell hybridization; e.g., NAG (HBBC) to 11p.REb = combined with chromosome sorting; e.g., insulin to 11p.Includes Lebo's adaptation (dual laser chromosome sorting and spot blot DNAanalysis); e.g., MGP to 11q.  (For this method, using flow sortedchromosomes, W is the symbol adopted by the HGM workshops.)REc = hybridization of cDNA to genomic fragment (by YAC, PFGE,microdissection, etc.), e.g., A11 on Xq.REf = isolation of gene from genomic DNA; includes 'exon trapping'REl = isolation of gene from chromosome-specific genomic library (see Pcm).REn = neighbor analysis in restriction fragments, e.g., in PFGE.S = `segregation' (cosegregation) of human cellular traits and human chromosomes (or segments of chromosomes) in particular clones from interspeciessomatic cell hybrids; e.g., thymidine kinase to chromosome 17.When with restriction enzyme, REa; with hybridization in solution, HS.T = TACT = telomere-associated chromosome fragmentation; e.g., interferon-inducible protein 6-16.V = induction of microscopically evident chromosomal change by a virus; e.g., adenovirus 12 changes on chromosomes 1 and 17.X/A = X-autosome translocation in female with X-linked recessive disorder;e.g., assignment of Duchenne muscular dystrophy to Xp21.">
@@ -190,6 +191,7 @@
 ##INFO=<ID=TEUR,Number=.,Type=String,Description="Thousand Genomes EUR AF">
 ##INFO=<ID=TMAF,Number=.,Type=String,Description="Thousand Genomes Allele frequency">
 ##INFO=<ID=UCG,Number=.,Type=String,Description="Mutation type (UCSC)">
+##INFO=<ID=VE,Number=A,Type=String,Description="A comma-separated list of predicted variant effects, generated by SnpEff">
 ##SnpEffCmd="SnpEff  80_3204292 -noStats /home/exacloud/lustre1/prime-seq/workDir/7dc6e4ab-6108-1035-a9d7-c8ba6188526b/SequenceO.work/WGS_ONPRC_ThroughBatch4.vcf.gz "
 ##SnpEffVersion="4.3k (build 2017-03-29 17:16), by Pablo Cingolani"
 ##contig=<ID=1,length=249250621>

diff --git a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf
@@ -75,13 +75,15 @@
 ##INFO=<ID=NDA,Number=1,Type=Integer,Description="Number of alternate alleles discovered (but not necessarily genotyped) at this site">
 ##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
 ##INFO=<ID=OA,Number=R,Type=String,Description="This contains a list of the original alleles at this site.  It is primarily intended to store information prior to liftover.">
+##INFO=<ID=OG,Number=A,Type=String,Description="A comma-separated list of any overlapping genes, identified by SnpEff. This includes variants immediately upstream/downstream of the coding region.">
 ##INFO=<ID=OriginalContig,Number=1,Type=String,Description="The name of the source contig/chromosome prior to liftover.">
 ##INFO=<ID=OriginalStart,Number=1,Type=String,Description="The position of the variant on the source contig prior to liftover.">
 ##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
 ##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
 ##INFO=<ID=RFG,Number=.,Type=String,Description="Mutation type (Refseq)">
 ##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
 ##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
+##INFO=<ID=VE,Number=A,Type=String,Description="A comma-separated list of predicted variant effects, generated by SnpEff">
 ##SnpEffCmd="SnpEff  80_3204292 -noStats /home/exacloud/lustre1/prime-seq/workDir/7dc6e4ab-6108-1035-a9d7-c8ba6188526b/SequenceO.work/WGS_ONPRC_ThroughBatch4.vcf.gz "
 ##SnpEffVersion="4.3k (build 2017-03-29 17:16), by Pablo Cingolani"
 ##contig=<ID=1,length=249250621>