Skip to content

Commit

Permalink
Improvements to GenotypeConcordance annotation (#347)
Browse files Browse the repository at this point in the history
* Dont print reference genotypes longer than 10 chars in GenotypeConcordance
* Correct bug in GenotypeConcordance with long variants
* Limit to just reference variants with the same start
  • Loading branch information
bbimber authored Oct 24, 2024
1 parent 0efe622 commit 0a14e97
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public class GenotypeConcordance extends PedigreeAnnotation implements GenotypeA
public static final String KEY = "GTD";
public static final String D_KEY = "REF_GT";

private static final int MAX_GT_LENGTH = 10;

public GenotypeConcordanceArgumentCollection args = null;

public GenotypeConcordance()
Expand Down Expand Up @@ -63,22 +65,27 @@ public void annotate(ReferenceContext ref, VariantContext vc, Genotype g, Genoty
return;
}

List<VariantContext> list = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd()));
if (list == null || list.isEmpty()){
List<VariantContext> referenceVCs = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart())).stream().filter(refVC -> refVC.getStart() == vc.getStart()).toList();
if (referenceVCs.isEmpty()){
return;
}

for (VariantContext c : list){
Genotype refGenotype = c.getGenotype(g.getSampleName());
if (refGenotype != null && !refGenotype.isFiltered() && !refGenotype.isNoCall()) {
if (!refGenotype.sameGenotype(g)) {
gb.attribute(KEY, "1");
if (referenceVCs.size() > 1) {
throw new IllegalArgumentException("More than one reference found for site: " + vc.getContig() + "-" + vc.getStart());
}

VariantContext c = referenceVCs.get(0);
Genotype refGenotype = c.getGenotype(g.getSampleName());
if (refGenotype != null && !refGenotype.isFiltered() && !refGenotype.isNoCall()) {
if (!refGenotype.sameGenotype(g)) {
gb.attribute(KEY, "1");
String gt = refGenotype.getGenotypeString();
if (gt.length() < MAX_GT_LENGTH) {
gb.attribute(D_KEY, refGenotype.getGenotypeString());
}
else {
gb.attribute(KEY, "0");
}

}
else {
gb.attribute(KEY, "0");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,16 @@ public Map<String, Object> annotate(ReferenceContext ref, VariantContext vc, All
throw new IllegalArgumentException("Must provide a VCF with reference genotypes!");
}

List<VariantContext> list = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd()));
if (list == null || list.isEmpty()){
List<VariantContext> referenceVCs = args.featureManager.getFeatures(args.referenceVcf, new SimpleInterval(vc.getContig(), vc.getStart(), vc.getStart())).stream().filter(refVC -> refVC.getStart() == vc.getStart()).toList();
if (referenceVCs.isEmpty()){
return null;
}

VariantContext refVC = list.get(0);
if (referenceVCs.size() > 1) {
throw new IllegalArgumentException("More than one reference found for site: " + vc.getContig() + "-" + vc.getStart());
}

VariantContext refVC = referenceVCs.get(0);

AtomicInteger discord = new AtomicInteger(0);
AtomicInteger concord = new AtomicInteger(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ public void basicTestWithCustomAnnotations() throws Exception {
args.add("A", "MendelianViolationBySample");

args.addRaw("-rg");
args.addRaw(normalizePath(input));
File refGT = new File(testBaseDir, "MendelianViolationRefGT.vcf");
ensureVcfIndex(refGT);
args.addRaw(normalizePath(refGT));

args.addRaw("-O");
args.addRaw("%s");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.0
##MendelianViolationEvaluator="This file contains genotypes with mendelian violations in the input data. Please note that this file is entirely synthetic and does not represent true SNPs or genotypes observed."
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=3,Type=Float,Description="Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TRIO_MOTHER TRIO_FATHER TRIO_CHILD PAIR_PARENT PAIR_CHILD UNRELATED_INDIVIDUAL_CONTROL
1 10109 . A T 99 PASS . GT:GQ:PL 0/1:50:0,50,200 0/0:40:0,40,200 0/1:30:30,0,200 0/0:50:0,50,200 0/1:30:30,0,200 0/0:50:0,50,200
1 10147 . C A 99 PASS . GT:GQ:PL 0/0:30:0,30,200 0/0:50:0,50,200 1/1:40:200,40,0 0/0:30:0,30,200 1/1:40:200,40,0 0/0:30:0,30,200
1 10150 . C T 99 PASS . GT:GQ:PL 0/0:40:0,40,200 0/1:30:30,0,200 1/1:50:200,50,0 0/0:40:0,40,200 1/1:50:200,50,0 0/0:40:0,40,200
1 10177 . A C 99 PASS . GT:GQ:PL 0/0:50:0,50,200 1/1:40:200,40,0 0/0:30:0,30,200 0/0:50:0,50,200 0/0:30:0,30,200 0/0:50:0,50,200
1 10180 . T C 99 PASS . GT:GQ:PL 1/0:30:0,30,200 1/1:50:200,50,0 0/1:40:200,40,0 0/0:30:0,30,200 1/1:40:200,40,0 0/0:30:0,30,200
1 10234 . C T 99 PASS . GT:GQ:PL 0/1:40:40,0,200 0/0:30:0,30,200 1/1:50:200,50,0 0/1:40:40,0,200 1/1:50:200,50,0 0/1:40:40,0,200
1 10235 . T A 99 PASS . GT:GQ:PL 0/1:50:50,0,200 1/1:40:200,40,0 0/0:30:0,30,200 0/1:50:50,0,200 0/0:30:0,30,200 0/1:50:50,0,200
1 10236 . A G 99 PASS . GT:GQ:PL 1/1:30:200,30,0 0/0:50:0,50,200 0/0:40:0,40,200 1/1:30:200,30,0 0/0:50:0,50,200 1/1:30:200,30,0
1 10250 . A C 99 PASS . GT:GQ:PL 1/1:40:200,40,0 0/0:30:0,30,200 1/1:50:200,50,0 1/1:40:200,40,0 1/1:50:200,50,0 1/1:40:200,40,0
1 10257 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:40:40,0,200 0/0:30:0,30,200 1/1:50:200,50,0 0/0:30:0,30,200 1/1:50:200,50,0
1 10285 . T C 99 PASS . GT:GQ:PL 1/1:30:200,30,0 1/1:50:200,50,0 0/1:40:40,0,200 1/1:30:200,30,0 0/1:40:40,0,200 1/1:30:200,30,0
1 10297 . C T 99 PASS . GT:GQ:PL 1/1:40:200,40,0 1/1:30:200,30,0 0/0:50:0,50,200 1/1:40:200,40,0 0/0:50:0,50,200 1/1:40:200,40,0
1 10304 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:50:50,0,200 0/1:50:50,0,200 1/1:50:200,50,0 0/1:50:200,0,50 1/1:50:200,50,0
1 10310 . A C 99 PASS . GT:GQ:PL 1/1:50:200,50,0 0/1:50:50,0,200 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0
1 10315 . C T 99 PASS . GT:GQ:PL 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0 1/1:50:200,50,0
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
##MendelianViolationEvaluator="This file contains genotypes with mendelian violations in the input data. Please note that this file is entirely synthetic and does not represent true SNPs or genotypes observed."
##contig=<ID=1,length=0>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TRIO_MOTHER TRIO_FATHER TRIO_CHILD PAIR_PARENT PAIR_CHILD UNRELATED_INDIVIDUAL_CONTROL
1 10109 . A T 99 PASS AC=2;AF=0.167;AN=12;GTC=6;GTD=0;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:50:0:0:0,50,200 0/0:40:0:0:0,40,200 0/1:30:0:1:30,0,200 0/0:50:0:0:0,50,200 0/1:30:0:0:30,0,200 0/0:50:0:0:0,50,200
1 10109 . A T 99 PASS AC=2;AF=0.167;AN=12;GTC=5;GTD=1;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL:REF_GT 0/0:50:1:0:0,50,200:A/T 0/0:40:0:0:0,40,200 0/1:30:0:1:30,0,200 0/0:50:0:0:0,50,200 0/1:30:0:0:30,0,200 0/0:50:0:0:0,50,200
1 10147 . C A 99 PASS AC=4;AF=0.333;AN=12;GTC=6;GTD=0;MAF=0.333;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:30:0:0:0,30,200 0/0:50:0:0:0,50,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200
1 10150 . C T 99 PASS AC=5;AF=0.417;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:40:0:0:0,40,200 0/1:30:0:0:30,0,200 1/1:50:0:1:200,50,0 0/0:40:0:0:0,40,200 1/1:50:0:1:200,50,0 0/0:40:0:0:0,40,200
1 10177 . A C 99 PASS AC=2;AF=0.167;AN=12;GTC=6;GTD=0;MAF=0.167;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:50:0:0:0,50,200 1/1:40:0:0:200,40,0 0/0:30:0:1:0,30,200 0/0:50:0:0:0,50,200 0/0:30:0:0:0,30,200 0/0:50:0:0:0,50,200
1 10180 . T C 99 PASS AC=6;AF=0.500;AN=12;GTC=6;GTD=0;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 0/0:30:0:0:0,30,200 1/1:50:0:0:200,50,0 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200
1 10180 . T C 99 PASS AC=6;AF=0.500;AN=12;GTC=4;GTD=2;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL:REF_GT 0/0:30:1:0:0,30,200:C/T 1/1:50:0:0:200,50,0 1/1:40:1:1:200,40,0:T/C 0/0:30:0:0:0,30,200 1/1:40:0:1:200,40,0 0/0:30:0:0:0,30,200
1 10234 . C T 99 PASS AC=7;AF=0.583;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/1:40:0:0:40,0,200 0/0:30:0:0:0,30,200 1/1:50:0:1:200,50,0 0/1:40:0:0:40,0,200 1/1:50:0:0:200,50,0 0/1:40:0:0:40,0,200
1 10235 . T A 99 PASS AC=5;AF=0.417;AN=12;GTC=6;GTD=0;MAF=0.417;MV_NUM=1;MV_SAMPLES=TRIO_CHILD GT:GQ:GTD:MV:PL 0/1:50:0:0:50,0,200 1/1:40:0:0:200,40,0 0/0:30:0:1:0,30,200 0/1:50:0:0:50,0,200 0/0:30:0:0:0,30,200 0/1:50:0:0:50,0,200
1 10236 . A G 99 PASS AC=6;AF=0.500;AN=12;GTC=6;GTD=0;MAF=0.500;MV_NUM=2;MV_SAMPLES=PAIR_CHILD,TRIO_CHILD GT:GQ:GTD:MV:PL 1/1:30:0:0:200,30,0 0/0:50:0:0:0,50,200 0/0:40:0:1:0,40,200 1/1:30:0:0:200,30,0 0/0:50:0:1:0,50,200 1/1:30:0:0:200,30,0
Expand Down

0 comments on commit 0a14e97

Please sign in to comment.