Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for GVCF validation with multiple contigs #6028

Merged
merged 1 commit into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ public enum ValidationType {
// information to keep track of when validating a GVCF
private SimpleInterval previousInterval;
private int previousStart = -1;
private String previousContig = null;

@Override
public void onTraversalStart() {
Expand Down Expand Up @@ -239,14 +240,7 @@ public void apply(final VariantContext vc, final ReadsContext readsContext, fina
if (VALIDATE_GVCF) {
final SimpleInterval refInterval = ref.getInterval();

//if next VC refers to a previous genomic position, throw an error
//Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
// making v2's start less than the deletion's end
if (previousStart > -1 && vc.getStart() < previousStart) {
final UserException e = new UserException(String.format("In a GVCF all records must ordered. Record: %s covers a position previously traversed.",
vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
validateVariantsOrder(vc);

// GenomeLocSortedSet will automatically merge intervals that are overlapping when setting `mergeIfIntervalOverlaps`
// to true. In a GVCF most blocks are adjacent to each other so they wouldn't normally get merged. We check
Expand Down Expand Up @@ -347,6 +341,24 @@ private Collection<ValidationType> calculateValidationTypesToApply(final List<Va
}
}

private void validateVariantsOrder(final VariantContext vc) {
// Check if the current VC belongs to the same contig as the previous one.
// If not, reset the start position to -1.
if (previousContig == null || !previousContig.equals(vc.getContig())) {
previousContig = vc.getContig();
previousStart = -1;
}

//if next VC refers to a previous genomic position, throw an error
//Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
// making v2's start less than the deletion's end
if (previousStart > -1 && vc.getStart() < previousStart) {
final UserException e = new UserException(String.format("In a GVCF all records must ordered. Record: %s covers a position previously traversed.",
vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
}

private void validateGVCFVariant(final VariantContext vc) {
if (!vc.hasAllele(Allele.NON_REF_ALLELE)) {
final UserException e = new UserException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -355,4 +355,36 @@ public void testNonOverlappingRegionsBP_RESOLUTION() throws IOException {
Collections.emptyList());
spec.executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", this);
}

@Test
public void testGoodVariantsOrderTwoContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "goodGVCF.inOrderTwoContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
Collections.emptyList());
spec.executeTest("tests the variants order validation for a valid file including two contigs", this);
}

@Test
public void testBadVariantsOrderTwoContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "badGVCF.outOfOrderTwoContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
0, UserException.class);
spec.executeTest("tests the variants order validation for an invalid file including two contigs", this);
}

@Test
public void testGoodVariantsOrderThreeContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "goodGVCF.inOrderThreeContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
Collections.emptyList());
spec.executeTest("tests the variants order validation for a valid file including three contigs", this);
}

@Test
public void testBadVariantsOrderThreeContigs() throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(false, "badGVCF.outOfOrderThreeContigs.g.vcf", true, ALLELES, null, hg38Reference) + " -gvcf ",
0, UserException.class);
spec.executeTest("tests the variants order validation for an invalid file including three contigs", this);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MBQ,Number=R,Type=Integer,Description="median base quality">
##FORMAT=<ID=MFRL,Number=R,Type=Integer,Description="median fragment length">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MMQ,Number=A,Type=Integer,Description="median mapping quality">
##FORMAT=<ID=MPOS,Number=A,Type=Integer,Description="median distance from end of read">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SA_MAP_AF,Number=3,Type=Float,Description="MAP estimates of allele fraction given z">
##FORMAT=<ID=SA_POST_PROB,Number=3,Type=Float,Description="posterior probabilities of the presence of strand artifact">
##FORMAT=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score">
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample NA12878 --min-pruning 5 --emit-ref-confidence GVCF --output /var/folders/v5/gbkn5lv95c51kj3xpx5pzncscnxf_m/T/unfiltered2551757053106549600.vcf --intervals chrM:1-1000 --input /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/NA12878.bam --reference /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/Homo_sapiens_assembly38.mt_only.fasta --verbosity ERROR --genotype-pon-sites false --genotype-germline-sites false --af-of-alleles-not-in-resource -1.0 --tumor-lod-to-emit 3.0 --initial-tumor-lod 2.0 --initial-pcr-qual 40 --max-population-af 0.01 --downsampling-stride 1 --max-suspicious-reads-per-alignment-start 0 --normal-lod 2.2 --max-mnp-distance 1 --ignore-itr-artifacts false --get-af-from-ad false --gvcf-lod-band 500 --gvcf-lod-band 1000 --gvcf-lod-band 1700 --gvcf-lod-band 1800 --gvcf-lod-band 1900 --gvcf-lod-band 2000 --dont-trim-active-regions false --max-disc-ar-extension 25 --max-gga-ar-extension 300 --padding-around-indels 150 --padding-around-snps 20 --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --recover-dangling-heads false --do-not-recover-dangling-branches false --min-dangling-branch-length 4 --consensus false --max-num-haplotypes-in-population 128 --error-correct-kmers false --debug-graph-transformations false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --debug false --use-filtered-reads-for-annotations false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --capture-assembly-failure-bam false --error-correct-reads false --do-not-run-physical-phasing false --min-base-quality-score 10 --smith-waterman JAVA --use-new-qual-calculator false --annotate-with-num-discovered-alleles false --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --standard-min-confidence-threshold-for-calling 10.0 --max-alternate-alleles 6 --max-genotype-count 1024 --sample-ploidy 2 --num-reference-samples-if-no-call 0 --genotyping-mode DISCOVERY --genotype-filtered-alleles false --contamination-fraction-to-filter 0.0 --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --min-assembly-region-size 50 --max-assembly-region-size 300 --assembly-region-padding 100 --max-reads-per-alignment-start 50 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --max-read-length 2147483647 --min-read-length 30 --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version=Unavailable,Date="October 17, 2018 9:57:46 AM EDT">
##GVCFBlock0-500=minGQ=0(inclusive),maxGQ=500(exclusive)
##GVCFBlock1000-1700=minGQ=1000(inclusive),maxGQ=1700(exclusive)
##GVCFBlock1700-1800=minGQ=1700(inclusive),maxGQ=1800(exclusive)
##GVCFBlock1800-1900=minGQ=1800(inclusive),maxGQ=1900(exclusive)
##GVCFBlock1900-2000=minGQ=1900(inclusive),maxGQ=2000(exclusive)
##GVCFBlock2000-2147483647=minGQ=2000(inclusive),maxGQ=2147483647(exclusive)
##GVCFBlock500-1000=minGQ=500(inclusive),maxGQ=1000(exclusive)
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=ECNT,Number=1,Type=Integer,Description="Number of events in this haplotype">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=IN_PON,Number=0,Type=Flag,Description="site found in panel of normals">
##INFO=<ID=NLOD,Number=A,Type=Float,Description="Normal LOD score">
##INFO=<ID=N_ART_LOD,Number=A,Type=Float,Description="log odds of artifact in normal with same allele fraction as tumor">
##INFO=<ID=POP_AF,Number=A,Type=Float,Description="population allele frequencies of alt alleles">
##INFO=<ID=P_CONTAM,Number=1,Type=Float,Description="Posterior probability for alt allele to be due to contamination">
##INFO=<ID=P_GERMLINE,Number=A,Type=Float,Description="Posterior probability for alt allele to be germline variants">
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score">
##MutectVersion=2.1
##contig=<ID=chrM,length=16569>
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##filtering_status=Warning: unfiltered Mutect 2 calls. Please run FilterMutectCalls to remove false positives.
##source=Mutect2
##tumor_sample=NA12878
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chrM 1 . G <NON_REF> . . END=156 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chrM 152 . T C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chrM 153 . A <NON_REF> . . END=16569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993570 . A <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993560 . C <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr1 99993571 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chr1 99993572 . A <NON_REF> . . END=248956422 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984617 . A <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984617 . C <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
chr2 59984618 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943
chr2 59984619 . A <NON_REF> . . END=242193529 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51
Loading