diff --git a/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java b/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java index 458a2dcf..3fe037be 100644 --- a/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java +++ b/src/main/java/com/github/discvrseq/walkers/MultiSourceAnnotator.java @@ -406,7 +406,7 @@ public class MultiSourceAnnotator extends VariantWalker { private final List allAnnotationKeys = new ArrayList<>(); - private final Collection ALLOWABLE_FILTERS = Arrays.asList("ReverseComplementedIndel", "NoTarget", "MismatchedRefAllele", "IndelStraddlesMultipleIntevals"); + private final Collection ALLOWABLE_FILTERS = Arrays.asList("ReverseComplementedIndel", "NoTarget", "MismatchedRefAllele", "IndelStraddlesMultipleIntervals"); @Override public void onTraversalStart() { diff --git a/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java b/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java index 01ec95e7..ec402de2 100644 --- a/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java +++ b/src/main/java/com/github/discvrseq/walkers/annotator/Impact.java @@ -25,6 +25,8 @@ public class Impact implements InfoFieldAnnotation, StandardAnnotation { public static final String IMPACT_KEY = "IMPACT"; public static final String IMPACT_GENES_KEY = "HIG"; + public static final String OVERLAPPING_GENES_KEY = "OG"; + public static final String EFFECT_KEY = "VE"; @Override public Map annotate(ReferenceContext ref, VariantContext vc, AlleleLikelihoods likelihoods) { @@ -50,6 +52,8 @@ public Map annotate(ReferenceContext ref, VariantContext vc, All Map impactMap = new HashMap<>(); Map impactGeneMap = new HashMap<>(); + Map allGeneMap = new HashMap<>(); + Map effectMap = new HashMap<>(); for (Allele a : vc.getAlternateAlleles()) { if (!annByAllele.containsKey(a.getBaseString())) { @@ -58,14 +62,23 @@ public Map annotate(ReferenceContext ref, VariantContext vc, All Set impacts = new HashSet<>(); Set hig = new HashSet<>(); + Set og = new HashSet<>(); + Set effects = new HashSet<>(); for (String[] split : annByAllele.get(a.getBaseString())) { if (split.length > 7 && "protein_coding".equals(split[7])) { impacts.add(split[2]); - if ("HIGH".equals(split[2])) { - hig.add(split[3]); + if (!split[3].isEmpty()) { + og.add(split[3]); + if ("HIGH".equals(split[2])) { + hig.add(split[3]); + } } } + + if (!split[1].isEmpty()) { + effects.add(split[1]); + } } for (String val : Arrays.asList("HIGH", "LOW", "MODERATE")) { @@ -78,16 +91,32 @@ public Map annotate(ReferenceContext ref, VariantContext vc, All if (!hig.isEmpty()) { impactGeneMap.put(a, StringUtils.join(hig, "|")); } + + if (!og.isEmpty()) { + allGeneMap.put(a, StringUtils.join(og, "|")); + } + + if (!effects.isEmpty()) { + effectMap.put(a, StringUtils.join(effects, "|")); + } } if (!impactMap.isEmpty()) { attributeMap.put(IMPACT_KEY, vc.getAlternateAlleles().stream().map(a -> impactMap.getOrDefault(a, "")).toList()); } - if (!impactMap.isEmpty()) { + if (!impactGeneMap.isEmpty()) { attributeMap.put(IMPACT_GENES_KEY, vc.getAlternateAlleles().stream().map(a -> impactGeneMap.getOrDefault(a, "")).toList()); } + if (!allGeneMap.isEmpty()) { + attributeMap.put(OVERLAPPING_GENES_KEY, vc.getAlternateAlleles().stream().map(a -> allGeneMap.getOrDefault(a, "")).toList()); + } + + if (!effectMap.isEmpty()) { + attributeMap.put(EFFECT_KEY, vc.getAlternateAlleles().stream().map(a -> effectMap.getOrDefault(a, "")).toList()); + } + if (attributeMap.isEmpty()) { return null; } @@ -95,11 +124,13 @@ public Map annotate(ReferenceContext ref, VariantContext vc, All return attributeMap; } - public List getKeyNames() { return Arrays.asList(IMPACT_KEY, IMPACT_GENES_KEY); } + public List getKeyNames() { return Arrays.asList(IMPACT_KEY, IMPACT_GENES_KEY, OVERLAPPING_GENES_KEY, EFFECT_KEY); } public List getDescriptions() { return Arrays.asList( new VCFInfoHeaderLine(IMPACT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "The highest impact annotation provided by SnpEff, limited to protein_coding features"), - new VCFInfoHeaderLine(IMPACT_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes with high-impact effects on protein coding, identified by SnpEff") + new VCFInfoHeaderLine(IMPACT_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes with high-impact effects on protein coding, identified by SnpEff"), + new VCFInfoHeaderLine(OVERLAPPING_GENES_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of any overlapping genes, identified by SnpEff. This includes variants immediately upstream/downstream of the coding region."), + new VCFInfoHeaderLine(EFFECT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "A comma-separated list of predicted variant effects, generated by SnpEff") ); } } diff --git a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf index 96294640..32f69fed 100644 --- a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf +++ b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutput.vcf @@ -152,6 +152,7 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -190,6 +191,7 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##SnpEffCmd="SnpEff 80_3204292 -noStats /home/exacloud/lustre1/prime-seq/workDir/7dc6e4ab-6108-1035-a9d7-c8ba6188526b/SequenceO.work/WGS_ONPRC_ThroughBatch4.vcf.gz " ##SnpEffVersion="4.3k (build 2017-03-29 17:16), by Pablo Cingolani" ##contig= diff --git a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf index 2644ec63..d88ef744 100644 --- a/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf +++ b/src/test/resources/com/github/discvrseq/walkers/MultiSourceAnnotator/multiSourceOutputSubsetArgs.vcf @@ -75,6 +75,7 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -82,6 +83,7 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##SnpEffCmd="SnpEff 80_3204292 -noStats /home/exacloud/lustre1/prime-seq/workDir/7dc6e4ab-6108-1035-a9d7-c8ba6188526b/SequenceO.work/WGS_ONPRC_ThroughBatch4.vcf.gz " ##SnpEffVersion="4.3k (build 2017-03-29 17:16), by Pablo Cingolani" ##contig=