Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add malaria spanning deletion exception regression test with fix #8802

Merged
merged 2 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -327,13 +327,11 @@ public static void makeGenotypeCall(final int ploidy,
gb.alleles(noCallAlleles(ploidy));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN ||
assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) {
if (assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
}
if ((genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) && assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
}
} else {
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -668,4 +668,37 @@ public void testAddedHcAsAnnotations() throws IOException {
true
);
}
@Test
public void testSpanDelRegression() throws IOException {

final File input = getTestFile("reblock_cleanup_bug_variant.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();

args.addReference(new File(pf_reference))
.add("V", input)
.addFlag("do-qual-approx")
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false")
.add("A", "AssemblyComplexity")
.addFlag("floor-blocks")
.add("GQB", 20)
.add("GQB", 30)
.add("GQB", 40)
.addOutput(output);

runCommandLine(args);

Pair<VCFHeader, List<VariantContext>> actual = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
List<VariantContext> variants = actual.getRight();
Assert.assertEquals(variants.size(), 4);
VariantContext v0 = variants.get(0);
//crappy deletions are all collapsed into GQ0 ref block
Assert.assertEquals(v0.getStart(), 646914);
Assert.assertEquals(v0.getAttributeAsInt(VCFConstants.END_KEY, 0), 646953);
Assert.assertTrue(v0.getGenotype(0).isHomRef());
Assert.assertEquals(v0.getGenotype(0).getGQ(), 0);
//no more star alleles because deletions are all gone
Assert.assertFalse(variants.stream().anyMatch(v -> v.getAlternateAlleles().contains(Allele.SPAN_DEL)));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=HaplotypeCaller,CommandLine="HaplotypeCaller --annotate-with-num-discovered-alleles true --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --sample-ploidy 2 --contamination-fraction-to-filter 0.0 --gvcf-gq-bands 10 --gvcf-gq-bands 20 --gvcf-gq-bands 30 --gvcf-gq-bands 40 --gvcf-gq-bands 50 --gvcf-gq-bands 60 --gvcf-gq-bands 70 --gvcf-gq-bands 80 --gvcf-gq-bands 90 --linked-de-bruijn-graph true --pileup-detection true --pileup-detection-enable-indel-pileup-calling true --bam-output SEN_2012_SLP.CXX_0108.haplotype_caller.Pf3D7_11_v3.bamout.bam --smith-waterman FASTEST_AVAILABLE --emit-ref-confidence GVCF --output SEN_2012_SLP.CXX_0108.haplotype_caller.Pf3D7_11_v3.g.vcf.gz --intervals Pf3D7_11_v3 --input gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/results/SRFlowcell/SEN_2012_SLP.CXX_0108/reads/aligned/SEN_2012_SLP.CXX_0108.aligned.merged.markDuplicates.sorted.BQSR.bam --reference /cromwell_root/broad-dsde-methods-long-reads/resources/references/plasmodb_release-61/Pfalciparum3D7/fasta/data/PlasmoDB-61_Pfalciparum3D7_Genome.fasta --annotation AssemblyComplexity --annotation-group StandardAnnotation --annotation-group StandardHCAnnotation --use-posteriors-to-calculate-qual false --dont-use-dragstr-priors false --use-new-qual-calculator true --standard-min-confidence-threshold-for-calling 30.0 --max-alternate-alleles 6 --max-genotype-count 1024 --num-reference-samples-if-no-call 0 --genotype-assignment-method USE_PLS_TO_ASSIGN --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --flow-likelihood-parallel-threads 0 --flow-likelihood-optimized-comp false --trim-to-haplotype true --exact-matching false --flow-use-t0-tag false --flow-probability-threshold 0.003 --flow-remove-non-single-base-pair-indels false --flow-remove-one-zero-probs false --flow-quantization-bins 121 --flow-fill-empty-bins-value 0.001 --flow-symmetric-indel-probs false --flow-report-insertion-or-deletion false --flow-disallow-probs-larger-than-call false --flow-lump-probs false --flow-retain-max-n-probs-base-format false --flow-probability-scaling-factor 10 --flow-order-cycle-length 4 --keep-boundary-flows false --floor-blocks false --indel-size-to-eliminate-in-ref-model 10 --disable-optimizations false --dragen-mode false --dragen-378-concordance-mode false --flow-mode NONE --apply-bqd false --apply-frd false --disable-spanning-event-genotyping false --transform-dragen-mapping-quality false --mapping-quality-threshold-for-genotyping 20 --max-effective-depth-adjustment-for-frd 0 --just-determine-active-regions false --dont-genotype false --do-not-run-physical-phasing false --do-not-correct-overlapping-quality false --use-filtered-reads-for-annotations false --use-flow-aligner-for-stepwise-hc-filtering false --adaptive-pruning false --do-not-recover-dangling-branches false --recover-dangling-heads false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --recover-all-dangling-branches false --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 2.302585092994046 --pruning-seeding-lod-threshold 9.210340371976184 --max-unpruned-variants 100 --disable-artificial-haplotype-recovery false --enable-legacy-graph-cycle-detection false --debug-assembly false --debug-graph-transformations false --capture-assembly-failure-bam false --num-matching-bases-in-dangling-end-to-recover -1 --error-correction-log-odds -Infinity --error-correct-reads false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --dragstr-het-hom-ratio 2 --dont-use-dragstr-pair-hmm-scores false --pair-hmm-gap-continuation-penalty 10 --expected-mismatch-rate-for-read-disqualification 0.02 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --disable-symmetric-hmm-normalizing false --disable-cap-base-qualities-to-map-quality false --enable-dynamic-read-disqualification-for-genotyping false --dynamic-read-disqualification-threshold 1.0 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --flow-hmm-engine-min-indel-adjust 6 --flow-hmm-engine-flat-insertion-penatly 45 --flow-hmm-engine-flat-deletion-penatly 45 --use-pdhmm false --use-pdhmm-overlap-optimization false --make-determined-haps-from-pd-code false --print-pileupcalling-status false --fallback-gga-if-pdhmm-fails true --pileup-detection-active-region-phred-threshold 0.0 --num-artificial-haplotypes-to-add-per-allele 5 --artifical-haplotype-filtering-kmer-size 10 --pileup-detection-snp-alt-threshold 0.1 --pileup-detection-indel-alt-threshold 0.1 --pileup-detection-absolute-alt-depth 0.0 --pileup-detection-snp-adjacent-to-assembled-indel-range 5 --pileup-detection-snp-basequality-filter 12 --pileup-detection-bad-read-tolerance 0.0 --pileup-detection-proper-pair-read-badness true --pileup-detection-edit-distance-read-badness-threshold 0.08 --pileup-detection-chimeric-read-badness true --pileup-detection-template-mean-badness-threshold 0.0 --pileup-detection-template-std-badness-threshold 0.0 --pileup-detection-filter-assembly-alt-bad-read-tolerance 0.0 --pileup-detection-edit-distance-read-badness-for-assembly-filtering-threshold 0.12 --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --override-fragment-softclip-check false --min-base-quality-score 10 --max-mnp-distance 0 --force-call-filtered-alleles false --reference-model-deletion-quality 30 --soft-clip-low-quality-ends false --allele-informative-reads-overlap-margin 2 --smith-waterman-dangling-end-match-value 25 --smith-waterman-dangling-end-mismatch-penalty -50 --smith-waterman-dangling-end-gap-open-penalty -110 --smith-waterman-dangling-end-gap-extend-penalty -6 --smith-waterman-haplotype-to-reference-match-value 200 --smith-waterman-haplotype-to-reference-mismatch-penalty -150 --smith-waterman-haplotype-to-reference-gap-open-penalty -260 --smith-waterman-haplotype-to-reference-gap-extend-penalty -11 --smith-waterman-read-to-haplotype-match-value 10 --smith-waterman-read-to-haplotype-mismatch-penalty -15 --smith-waterman-read-to-haplotype-gap-open-penalty -30 --smith-waterman-read-to-haplotype-gap-extend-penalty -5 --flow-assembly-collapse-hmer-size 0 --flow-assembly-collapse-partial-mode false --flow-filter-alleles false --flow-filter-alleles-qual-threshold 30.0 --flow-filter-alleles-sor-threshold 3.0 --flow-filter-lone-alleles false --flow-filter-alleles-debug-graphs false --min-assembly-region-size 50 --max-assembly-region-size 300 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --force-active false --assembly-region-padding 100 --padding-around-indels 75 --padding-around-snps 20 --padding-around-strs 75 --max-extension-into-assembly-region-padding-legacy 25 --max-reads-per-alignment-start 50 --enable-legacy-assembly-region-trimming false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false --allow-old-rms-mapping-quality-annotation-data false --assembly-complexity-reference-mode false",Version="4.5.0.0",Date="April 23, 2024 at 6:39:27 PM GMT">
##GVCFBlock0-10=minGQ=0(inclusive),maxGQ=10(exclusive)
##GVCFBlock10-20=minGQ=10(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-50=minGQ=40(inclusive),maxGQ=50(exclusive)
##GVCFBlock50-60=minGQ=50(inclusive),maxGQ=60(exclusive)
##GVCFBlock60-70=minGQ=60(inclusive),maxGQ=70(exclusive)
##GVCFBlock70-80=minGQ=70(inclusive),maxGQ=80(exclusive)
##GVCFBlock80-90=minGQ=80(inclusive),maxGQ=90(exclusive)
##GVCFBlock90-100=minGQ=90(inclusive),maxGQ=100(exclusive)
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=HAPCOMP,Number=A,Type=Integer,Description="Edit distances of each alt allele's most common supporting haplotype from closest germline haplotype, excluding differences at the site in question.">
##INFO=<ID=HAPDOM,Number=A,Type=Float,Description="For each alt allele, fraction of read support that best fits the most-supported haplotype containing the allele">
##INFO=<ID=HEC,Number=.,Type=Integer,Description="Counts of support for haplotype groups excluding difference at the site in question.">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NDA,Number=1,Type=Integer,Description="Number of alternate alleles discovered (but not necessarily genotyped) at this site">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=Pf3D7_01_v3,length=640851>
##contig=<ID=Pf3D7_02_v3,length=947102>
##contig=<ID=Pf3D7_03_v3,length=1067971>
##contig=<ID=Pf3D7_04_v3,length=1200490>
##contig=<ID=Pf3D7_05_v3,length=1343557>
##contig=<ID=Pf3D7_06_v3,length=1418242>
##contig=<ID=Pf3D7_07_v3,length=1445207>
##contig=<ID=Pf3D7_08_v3,length=1472805>
##contig=<ID=Pf3D7_09_v3,length=1541735>
##contig=<ID=Pf3D7_10_v3,length=1687656>
##contig=<ID=Pf3D7_11_v3,length=2038340>
##contig=<ID=Pf3D7_12_v3,length=2271494>
##contig=<ID=Pf3D7_13_v3,length=2925236>
##contig=<ID=Pf3D7_14_v3,length=3291936>
##contig=<ID=Pf3D7_API_v3,length=34250>
##contig=<ID=Pf3D7_MIT_v3,length=5967>
##source=HaplotypeCaller
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SENTh086.12-073.18_SEN.Di
Pf3D7_12_v3 646914 . TATA *,T,<NON_REF> 0.01 . HAPCOMP=0,0,0;HAPDOM=0.00,0.800,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=NaN,NaN,NaN;NDA=3 GT:PL ./.:0,0,0,0,0,0,0,0,0,0
Pf3D7_12_v3 646918 . T <NON_REF> . . END=646919 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
Pf3D7_12_v3 646920 . TTTTTTTTTTTTTC *,T,<NON_REF> 0 . DP=5;ExcessHet=0.0000;HAPCOMP=0,1,0;HAPDOM=0.00,1.00,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 0/0:4,0,0,0:4:12:0,15,123,12,114,180,15,124,152,151:2,2,0,0
Pf3D7_12_v3 646923 . T A,*,<NON_REF> 0.01 . DP=4;ExcessHet=0.0000;HAPCOMP=0,0,0;HAPDOM=0.111,0.00,0.00;HEC=7,4,1,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=14400,4 GT:AD:DP:GQ:PL:SB 0/0:0,0,0,0:0:0:0,0,1,12,13,99,11,12,61,50:0,0,0,0
Pf3D7_12_v3 646924 . T <NON_REF> . . END=646925 GT:DP:GQ:MIN_DP:PL 0/0:4:12:4:0,12,99
Pf3D7_12_v3 646926 . TTTTTTTC *,T,<NON_REF> 0 . DP=5;ExcessHet=0.0000;HAPCOMP=0,0,0;HAPDOM=0.00,1.00,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 0/0:0,0,0,0:0:0:0,12,180,0,12,0,12,102,12,90:0,0,0,0
Pf3D7_12_v3 646927 . TTTTTTC T,*,<NON_REF> 0.01 . DP=5;ExcessHet=0.0000;HAPCOMP=1,0,0;HAPDOM=0.111,0.00,0.00;HEC=7,4,1,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 1/2:0,0,0,0:0:0:1,1,1,1,0,0,1,1,1,1:0,0,0,0
Pf3D7_12_v3 646933 . C *,T,<NON_REF> 3.15 . DP=5;ExcessHet=0.0000;HAPCOMP=0,1,0;HAPDOM=0.00,0.111,0.00;HEC=7,4,1,0,0,0,0;MLEAC=1,1,0;MLEAF=0.500,0.500,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 1/1:0,0,0,0:0:1:123,15,0,16,1,1,76,14,15,62:0,0,0,0
Pf3D7_12_v3 646934 . T <NON_REF> . . END=646947 GT:DP:GQ:MIN_DP:PL 0/0:7:15:5:0,15,124
Pf3D7_12_v3 646948 . C <NON_REF> . . END=646948 GT:DP:GQ:MIN_DP:PL 0/0:3:9:3:0,9,74
Pf3D7_12_v3 646949 . A <NON_REF> . . END=646953 GT:DP:GQ:MIN_DP:PL 0/0:5:15:5:0,15,124
Pf3D7_12_v3 646954 . G <NON_REF> . . END=646955 GT:DP:GQ:MIN_DP:PL 0/0:7:21:7:0,21,163
Pf3D7_12_v3 646956 . C <NON_REF> . . END=646956 GT:DP:GQ:MIN_DP:PL 0/0:8:10:8:0,10,159
Pf3D7_12_v3 646957 . A <NON_REF> . . END=646991 GT:DP:GQ:MIN_DP:PL 0/0:8:21:7:0,21,163
Binary file not shown.
Loading