Skip to content

Commit

Permalink
Support for GVCF validation with multiple contigs (#6028)
Browse files Browse the repository at this point in the history
- Extracted the order validation for GVCF files into a separate method and included
a check to reset the counter when a new contig is found. Contigs have to
occur in continuous blocks; validation for files in which contigs occur
alternatingly is not supported.
- Added a set of integration tests for GVCF files with two and three contigs.

Fixes #6023
  • Loading branch information
michaelgatzen authored Jul 3, 2019
1 parent ed33e5b commit 90fd39b
Show file tree
Hide file tree
Showing 8 changed files with 281 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ public enum ValidationType {
// information to keep track of when validating a GVCF
private SimpleInterval previousInterval;
private int previousStart = -1;
private String previousContig = null;

@Override
public void onTraversalStart() {
Expand Down Expand Up @@ -239,14 +240,7 @@ public void apply(final VariantContext vc, final ReadsContext readsContext, fina
if (VALIDATE_GVCF) {
final SimpleInterval refInterval = ref.getInterval();

//if next VC refers to a previous genomic position, throw an error
//Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
// making v2's start less than the deletion's end
if (previousStart > -1 && vc.getStart() < previousStart) {
final UserException e = new UserException(String.format("In a GVCF all records must ordered. Record: %s covers a position previously traversed.",
vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
validateVariantsOrder(vc);

// GenomeLocSortedSet will automatically merge intervals that are overlapping when setting `mergeIfIntervalOverlaps`
// to true. In a GVCF most blocks are adjacent to each other so they wouldn't normally get merged. We check
Expand Down Expand Up @@ -347,6 +341,24 @@ private Collection<ValidationType> calculateValidationTypesToApply(final List<Va
}
}

private void validateVariantsOrder(final VariantContext vc) {
// Check if the current VC belongs to the same contig as the previous one.
// If not, reset the start position to -1.
if (previousContig == null || !previousContig.equals(vc.getContig())) {
previousContig = vc.getContig();
previousStart = -1;
}

//if next VC refers to a previous genomic position, throw an error
//Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
// making v2's start less than the deletion's end
if (previousStart > -1 && vc.getStart() < previousStart) {
final UserException e = new UserException(String.format("In a GVCF all records must ordered. Record: %s covers a position previously traversed.",
vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
}

private void validateGVCFVariant(final VariantContext vc) {
if (!vc.hasAllele(Allele.NON_REF_ALLELE)) {
final UserException e = new UserException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -355,4 +355,36 @@ public void testNonOverlappingRegionsBP_RESOLUTION() throws IOException {
Collections.emptyList());
spec.executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", this);
}

@Test
public void testGoodVariantsOrderTwoContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "goodGVCF.inOrderTwoContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
Collections.emptyList());
spec.executeTest("tests the variants order validation for a valid file including two contigs", this);
}

@Test
public void testBadVariantsOrderTwoContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "badGVCF.outOfOrderTwoContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
0, UserException.class);
spec.executeTest("tests the variants order validation for an invalid file including two contigs", this);
}

@Test
public void testGoodVariantsOrderThreeContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "goodGVCF.inOrderThreeContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
Collections.emptyList());
spec.executeTest("tests the variants order validation for a valid file including three contigs", this);
}

@Test
public void testBadVariantsOrderThreeContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "badGVCF.outOfOrderThreeContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
0, UserException.class);
spec.executeTest("tests the variants order validation for an invalid file including three contigs", this);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MBQ,Number=R,Type=Integer,Description="median base quality">
##FORMAT=<ID=MFRL,Number=R,Type=Integer,Description="median fragment length">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MMQ,Number=A,Type=Integer,Description="median mapping quality">
##FORMAT=<ID=MPOS,Number=A,Type=Integer,Description="median distance from end of read">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SA_MAP_AF,Number=3,Type=Float,Description="MAP estimates of allele fraction given z">
##FORMAT=<ID=SA_POST_PROB,Number=3,Type=Float,Description="posterior probabilities of the presence of strand artifact">
##FORMAT=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score">
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample NA12878 --min-pruning 5 --emit-ref-confidence GVCF --output /var/folders/v5/gbkn5lv95c51kj3xpx5pzncscnxf_m/T/unfiltered2551757053106549600.vcf --intervals chrM:1-1000 --input /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/NA12878.bam --reference /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/Homo_sapiens_assembly38.mt_only.fasta --verbosity ERROR --genotype-pon-sites false --genotype-germline-sites false --af-of-alleles-not-in-resource -1.0 --tumor-lod-to-emit 3.0 --initial-tumor-lod 2.0 --initial-pcr-qual 40 --max-population-af 0.01 --downsampling-stride 1 --max-suspicious-reads-per-alignment-start 0 --normal-lod 2.2 --max-mnp-distance 1 --ignore-itr-artifacts false --get-af-from-ad false --gvcf-lod-band 500 --gvcf-lod-band 1000 --gvcf-lod-band 1700 --gvcf-lod-band 1800 --gvcf-lod-band 1900 --gvcf-lod-band 2000 --dont-trim-active-regions false --max-disc-ar-extension 25 --max-gga-ar-extension 300 --padding-around-indels 150 --padding-around-snps 20 --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --recover-dangling-heads false --do-not-recover-dangling-branches false --min-dangling-branch-length 4 --consensus false --max-num-haplotypes-in-population 128 --error-correct-kmers false --debug-graph-transformations false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --debug false --use-filtered-reads-for-annotations false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --capture-assembly-failure-bam false --error-correct-reads false --do-not-run-physical-phasing false --min-base-quality-score 10 --smith-waterman JAVA --use-new-qual-calculator false --annotate-with-num-discovered-alleles false --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --standard-min-confidence-threshold-for-calling 10.0 --max-alternate-alleles 6 --max-genotype-count 1024 --sample-ploidy 2 --num-reference-samples-if-no-call 0 --genotyping-mode DISCOVERY --genotype-filtered-alleles false --contamination-fraction-to-filter 0.0 --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --min-assembly-region-size 50 --max-assembly-region-size 300 --assembly-region-padding 100 --max-reads-per-alignment-start 50 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --max-read-length 2147483647 --min-read-length 30 --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version=Unavailable,Date="October 17, 2018 9:57:46 AM EDT">
##GVCFBlock0-500=minGQ=0(inclusive),maxGQ=500(exclusive)
##GVCFBlock1000-1700=minGQ=1000(inclusive),maxGQ=1700(exclusive)
##GVCFBlock1700-1800=minGQ=1700(inclusive),maxGQ=1800(exclusive)
##GVCFBlock1800-1900=minGQ=1800(inclusive),maxGQ=1900(exclusive)
##GVCFBlock1900-2000=minGQ=1900(inclusive),maxGQ=2000(exclusive)
##GVCFBlock2000-2147483647=minGQ=2000(inclusive),maxGQ=2147483647(exclusive)
##GVCFBlock500-1000=minGQ=500(inclusive),maxGQ=1000(exclusive)
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=ECNT,Number=1,Type=Integer,Description="Number of events in this haplotype">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=IN_PON,Number=0,Type=Flag,Description="site found in panel of normals">
##INFO=<ID=NLOD,Number=A,Type=Float,Description="Normal LOD score">
##INFO=<ID=N_ART_LOD,Number=A,Type=Float,Description="log odds of artifact in normal with same allele fraction as tumor">
##INFO=<ID=POP_AF,Number=A,Type=Float,Description="population allele frequencies of alt alleles">
##INFO=<ID=P_CONTAM,Number=1,Type=Float,Description="Posterior probability for alt allele to be due to contamination">
##INFO=<ID=P_GERMLINE,Number=A,Type=Float,Description="Posterior probability for alt allele to be germline variants">
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score">
##MutectVersion=2.1
##contig=<ID=chrM,length=16569>
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##filtering_status=Warning: unfiltered Mutect 2 calls. Please run FilterMutectCalls to remove false positives.
##source=Mutect2
##tumor_sample=NA12878
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chrM 1 . G <NON_REF> . . END=156 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chrM 152 . T C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chrM 153 . A <NON_REF> . . END=16569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993570 . A <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993560 . C <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993571 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chr1 99993572 . A <NON_REF> . . END=248956422 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984617 . A <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984617 . C <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984618 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chr2 59984619 . A <NON_REF> . . END=242193529 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
Loading

0 comments on commit 90fd39b

Please sign in to comment.