From 0a14e9701c0270ae73d2fb07e3cff145dae22244 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 24 Oct 2024 14:00:07 -0700 Subject: [PATCH] Improvements to GenotypeConcordance annotation (#347) * Dont print reference genotypes longer than 10 chars in GenotypeConcordance * Correct bug in GenotypeConcordance with long variants * Limit to just reference variants with the same start --- .../annotator/GenotypeConcordance.java | 29 +++++++++++------- .../annotator/GenotypeConcordanceBySite.java | 10 ++++-- ...DiscvrVariantAnnotatorIntegrationTest.java | 4 ++- .../TestData/MendelianViolationRefGT.vcf | 21 +++++++++++++ .../TestData/MendelianViolationRefGT.vcf.idx | Bin 0 -> 288 bytes .../basicTestWithCustomAnnotationsOutput.vcf | 4 +-- 6 files changed, 51 insertions(+), 17 deletions(-) create mode 100644 src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf create mode 100644 src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf.idx diff --git a/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordance.java b/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordance.java index 7d375e6a..07b60b01 100644 --- a/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordance.java +++ b/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordance.java @@ -27,6 +27,8 @@ public class GenotypeConcordance extends PedigreeAnnotation implements GenotypeA public static final String KEY = "GTD"; public static final String D_KEY = "REF_GT"; + private static final int MAX_GT_LENGTH = 10; + public GenotypeConcordanceArgumentCollection args = null; public GenotypeConcordance() @@ -63,22 +65,27 @@ public void annotate(ReferenceContext ref, VariantContext vc, Genotype g, Genoty return; } - List list = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd())); - if (list == null || list.isEmpty()){ + List referenceVCs = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart())).stream().filter(refVC -> refVC.getStart() == vc.getStart()).toList(); + if (referenceVCs.isEmpty()){ return; } - for (VariantContext c : list){ - Genotype refGenotype = c.getGenotype(g.getSampleName()); - if (refGenotype != null && !refGenotype.isFiltered() && !refGenotype.isNoCall()) { - if (!refGenotype.sameGenotype(g)) { - gb.attribute(KEY, "1"); + if (referenceVCs.size() > 1) { + throw new IllegalArgumentException("More than one reference found for site: " + vc.getContig() + "-" + vc.getStart()); + } + + VariantContext c = referenceVCs.get(0); + Genotype refGenotype = c.getGenotype(g.getSampleName()); + if (refGenotype != null && !refGenotype.isFiltered() && !refGenotype.isNoCall()) { + if (!refGenotype.sameGenotype(g)) { + gb.attribute(KEY, "1"); + String gt = refGenotype.getGenotypeString(); + if (gt.length() < MAX_GT_LENGTH) { gb.attribute(D_KEY, refGenotype.getGenotypeString()); } - else { - gb.attribute(KEY, "0"); - } - + } + else { + gb.attribute(KEY, "0"); } } } diff --git a/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordanceBySite.java b/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordanceBySite.java index 59f0851c..18084b5e 100644 --- a/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordanceBySite.java +++ b/src/main/java/com/github/discvrseq/walkers/annotator/GenotypeConcordanceBySite.java @@ -57,12 +57,16 @@ public Map annotate(ReferenceContext ref, VariantContext vc, All throw new IllegalArgumentException("Must provide a VCF with reference genotypes!"); } - List list = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd())); - if (list == null || list.isEmpty()){ + List referenceVCs = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart())).stream().filter(refVC -> refVC.getStart() == vc.getStart()).toList(); + if (referenceVCs.isEmpty()){ return null; } - VariantContext refVC = list.get(0); + if (referenceVCs.size() > 1) { + throw new IllegalArgumentException("More than one reference found for site: " + vc.getContig() + "-" + vc.getStart()); + } + + VariantContext refVC = referenceVCs.get(0); AtomicInteger discord = new AtomicInteger(0); AtomicInteger concord = new AtomicInteger(0); diff --git a/src/test/java/com/github/discvrseq/walkers/annotator/DiscvrVariantAnnotatorIntegrationTest.java b/src/test/java/com/github/discvrseq/walkers/annotator/DiscvrVariantAnnotatorIntegrationTest.java index e220121e..8a5b8838 100644 --- a/src/test/java/com/github/discvrseq/walkers/annotator/DiscvrVariantAnnotatorIntegrationTest.java +++ b/src/test/java/com/github/discvrseq/walkers/annotator/DiscvrVariantAnnotatorIntegrationTest.java @@ -78,7 +78,9 @@ public void basicTestWithCustomAnnotations() throws Exception { args.add("A", "MendelianViolationBySample"); args.addRaw("-rg"); - args.addRaw(normalizePath(input)); + File refGT = new File(testBaseDir, "MendelianViolationRefGT.vcf"); + ensureVcfIndex(refGT); + args.addRaw(normalizePath(refGT)); args.addRaw("-O"); args.addRaw("%s"); diff --git a/src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf b/src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf new file mode 100644 index 00000000..2c4e68df --- /dev/null +++ b/src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.0 +##MendelianViolationEvaluator="This file contains genotypes with mendelian violations in the input data. Please note that this file is entirely synthetic and does not represent true SNPs or genotypes observed." +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TRIO_MOTHER TRIO_FATHER TRIO_CHILD PAIR_PARENT PAIR_CHILD UNRELATED_INDIVIDUAL_CONTROL +1 10109 . A T 99 PASS . GT:GQ:PL 0/1:50:0,50,200 0/0:40:0,40,200 0/1:30:30,0,200 0/0:50:0,50,200 0/1:30:30,0,200 0/0:50:0,50,200 +1 10147 . C A 99 PASS . GT:GQ:PL 0/0:30:0,30,200 0/0:50:0,50,200 1/1:40:200,40,0 0/0:30:0,30,200 1/1:40:200,40,0 0/0:30:0,30,200 +1 10150 . C T 99 PASS . GT:GQ:PL 0/0:40:0,40,200 0/1:30:30,0,200 1/1:50:200,50,0 0/0:40:0,40,200 1/1:50:200,50,0 0/0:40:0,40,200 +1 10177 . A C 99 PASS . GT:GQ:PL 0/0:50:0,50,200 1/1:40:200,40,0 0/0:30:0,30,200 0/0:50:0,50,200 0/0:30:0,30,200 0/0:50:0,50,200 +1 10180 . T C 99 PASS . GT:GQ:PL 1/0:30:0,30,200 1/1:50:200,50,0 0/1:40:200,40,0 0/0:30:0,30,200 1/1:40:200,40,0 0/0:30:0,30,200 +1 10234 . C T 99 PASS . GT:GQ:PL 0/1:40:40,0,200 0/0:30:0,30,200 1/1:50:200,50,0 0/1:40:40,0,200 1/1:50:200,50,0 0/1:40:40,0,200 +1 10235 . T A 99 PASS . GT:GQ:PL 0/1:50:50,0,200 1/1:40:200,40,0 0/0:30:0,30,200 0/1:50:50,0,200 0/0:30:0,30,200 0/1:50:50,0,200 +1 10236 . A G 99 PASS . GT:GQ:PL 1/1:30:200,30,0 0/0:50:0,50,200 0/0:40:0,40,200 1/1:30:200,30,0 0/0:50:0,50,200 1/1:30:200,30,0 +1 10250 . A C 99 PASS . GT:GQ:PL 1/1:40:200,40,0 0/0:30:0,30,200 1/1:50:200,50,0 1/1:40:200,40,0 1/1:50:200,50,0 1/1:40:200,40,0 +1 10257 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:40:40,0,200 0/0:30:0,30,200 1/1:50:200,50,0 0/0:30:0,30,200 1/1:50:200,50,0 +1 10285 . T C 99 PASS . GT:GQ:PL 1/1:30:200,30,0 1/1:50:200,50,0 0/1:40:40,0,200 1/1:30:200,30,0 0/1:40:40,0,200 1/1:30:200,30,0 +1 10297 . C T 99 PASS . GT:GQ:PL 1/1:40:200,40,0 1/1:30:200,30,0 0/0:50:0,50,200 1/1:40:200,40,0 0/0:50:0,50,200 1/1:40:200,40,0 +1 10304 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:50:50,0,200 0/1:50:50,0,200 1/1:50:200,50,0 0/1:50:200,0,50 1/1:50:200,50,0 +1 10310 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:50:50,0,200 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 +1 10315 . C T 99 PASS . GT:GQ:PL 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 diff --git a/src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf.idx b/src/test/resources/com/github/discvrseq/TestData/MendelianViolationRefGT.vcf.idx new file mode 100644 index 0000000000000000000000000000000000000000..e7a619b7f9a5116135df8cbf008f47565953a4dd GIT binary patch literal 288 zcmZ8bK~BRk5L^l;E`0(IKNEXX;y;7nPvW6vFG1E+Ubz0_Wk;#)YiP_VxP>|d~Y)at%24Kc)>YX$H KiNe1 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TRIO_MOTHER TRIO_FATHER TRIO_CHILD PAIR_PARENT PAIR_CHILD UNRELATED_INDIVIDUAL_CONTROL -1 10109 . A T 99 PASS AC=2;AF=0.167;AN=12;GTC=6;GTD=0;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:50:0:0:0,50,200 0/0:40:0:0:0,40,200 0/1:30:0:1:30,0,200 0/0:50:0:0:0,50,200 0/1:30:0:0:30,0,200 0/0:50:0:0:0,50,200 +1 10109 . A T 99 PASS AC=2;AF=0.167;AN=12;GTC=5;GTD=1;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL:REF_GT 0/0:50:1:0:0,50,200:A/T 0/0:40:0:0:0,40,200 0/1:30:0:1:30,0,200 0/0:50:0:0:0,50,200 0/1:30:0:0:30,0,200 0/0:50:0:0:0,50,200 1 10147 . C A 99 PASS AC=4;AF=0.333;AN=12;GTC=6;GTD=0;MAF=0.333;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:30:0:0:0,30,200 0/0:50:0:0:0,50,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1 10150 . C T 99 PASS AC=5;AF=0.417;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:40:0:0:0,40,200 0/1:30:0:0:30,0,200 1/1:50:0:1:200,50,0 0/0:40:0:0:0,40,200 1/1:50:0:1:200,50,0 0/0:40:0:0:0,40,200 1 10177 . A C 99 PASS AC=2;AF=0.167;AN=12;GTC=6;GTD=0;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:50:0:0:0,50,200 1/1:40:0:0:200,40,0 0/0:30:0:1:0,30,200 0/0:50:0:0:0,50,200 0/0:30:0:0:0,30,200 0/0:50:0:0:0,50,200 -1 10180 . T C 99 PASS AC=6;AF=0.500;AN=12;GTC=6;GTD=0;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:30:0:0:0,30,200 1/1:50:0:0:200,50,0 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 +1 10180 . T C 99 PASS AC=6;AF=0.500;AN=12;GTC=4;GTD=2;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL:REF_GT 0/0:30:1:0:0,30,200:C/T 1/1:50:0:0:200,50,0 1/1:40:1:1:200,40,0:T/C 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1 10234 . C T 99 PASS AC=7;AF=0.583;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/1:40:0:0:40,0,200 0/0:30:0:0:0,30,200 1/1:50:0:1:200,50,0 0/1:40:0:0:40,0,200 1/1:50:0:0:200,50,0 0/1:40:0:0:40,0,200 1 10235 . T A 99 PASS AC=5;AF=0.417;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/1:50:0:0:50,0,200 1/1:40:0:0:200,40,0 0/0:30:0:1:0,30,200 0/1:50:0:0:50,0,200 0/0:30:0:0:0,30,200 0/1:50:0:0:50,0,200 1 10236 . A G 99 PASS AC=6;AF=0.500;AN=12;GTC=6;GTD=0;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 1/1:30:0:0:200,30,0 0/0:50:0:0:0,50,200 0/0:40:0:1:0,40,200 1/1:30:0:0:200,30,0 0/0:50:0:1:0,50,200 1/1:30:0:0:200,30,0