-
Notifications
You must be signed in to change notification settings - Fork 594
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support for GVCF validation with multiple contigs (#6028)
- Extracted the order validation for GVCF files into a separate method and included a check to reset the counter when a new contig is found. Contigs have to occur in continuous blocks; validation for files in which contigs occur alternatingly is not supported. - Added a set of integration tests for GVCF files with two and three contigs. Fixes #6023
- Loading branch information
1 parent
ed33e5b
commit 90fd39b
Showing
8 changed files
with
281 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
...dinstitute/hellbender/tools/walkers/ValidateVariants/badGVCF.outOfOrderThreeContigs.g.vcf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
##fileformat=VCFv4.2 | ||
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions of alternate alleles in the tumor"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele"> | ||
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MBQ,Number=R,Type=Integer,Description="median base quality"> | ||
##FORMAT=<ID=MFRL,Number=R,Type=Integer,Description="median fragment length"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MMQ,Number=A,Type=Integer,Description="median mapping quality"> | ||
##FORMAT=<ID=MPOS,Number=A,Type=Integer,Description="median distance from end of read"> | ||
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another"> | ||
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=SA_MAP_AF,Number=3,Type=Float,Description="MAP estimates of allele fraction given z"> | ||
##FORMAT=<ID=SA_POST_PROB,Number=3,Type=Float,Description="posterior probabilities of the presence of strand artifact"> | ||
##FORMAT=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score"> | ||
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample NA12878 --min-pruning 5 --emit-ref-confidence GVCF --output /var/folders/v5/gbkn5lv95c51kj3xpx5pzncscnxf_m/T/unfiltered2551757053106549600.vcf --intervals chrM:1-1000 --input /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/NA12878.bam --reference /Users/gauthier/workspaces/gatk/src/test/resources/org/broadinstitute/hellbender/tools/mutect/mito/Homo_sapiens_assembly38.mt_only.fasta --verbosity ERROR --genotype-pon-sites false --genotype-germline-sites false --af-of-alleles-not-in-resource -1.0 --tumor-lod-to-emit 3.0 --initial-tumor-lod 2.0 --initial-pcr-qual 40 --max-population-af 0.01 --downsampling-stride 1 --max-suspicious-reads-per-alignment-start 0 --normal-lod 2.2 --max-mnp-distance 1 --ignore-itr-artifacts false --get-af-from-ad false --gvcf-lod-band 500 --gvcf-lod-band 1000 --gvcf-lod-band 1700 --gvcf-lod-band 1800 --gvcf-lod-band 1900 --gvcf-lod-band 2000 --dont-trim-active-regions false --max-disc-ar-extension 25 --max-gga-ar-extension 300 --padding-around-indels 150 --padding-around-snps 20 --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --recover-dangling-heads false --do-not-recover-dangling-branches false --min-dangling-branch-length 4 --consensus false --max-num-haplotypes-in-population 128 --error-correct-kmers false --debug-graph-transformations false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --debug false --use-filtered-reads-for-annotations false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --capture-assembly-failure-bam false --error-correct-reads false --do-not-run-physical-phasing false --min-base-quality-score 10 --smith-waterman JAVA --use-new-qual-calculator false --annotate-with-num-discovered-alleles false --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --standard-min-confidence-threshold-for-calling 10.0 --max-alternate-alleles 6 --max-genotype-count 1024 --sample-ploidy 2 --num-reference-samples-if-no-call 0 --genotyping-mode DISCOVERY --genotype-filtered-alleles false --contamination-fraction-to-filter 0.0 --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --min-assembly-region-size 50 --max-assembly-region-size 300 --assembly-region-padding 100 --max-reads-per-alignment-start 50 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --max-read-length 2147483647 --min-read-length 30 --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version=Unavailable,Date="October 17, 2018 9:57:46 AM EDT"> | ||
##GVCFBlock0-500=minGQ=0(inclusive),maxGQ=500(exclusive) | ||
##GVCFBlock1000-1700=minGQ=1000(inclusive),maxGQ=1700(exclusive) | ||
##GVCFBlock1700-1800=minGQ=1700(inclusive),maxGQ=1800(exclusive) | ||
##GVCFBlock1800-1900=minGQ=1800(inclusive),maxGQ=1900(exclusive) | ||
##GVCFBlock1900-2000=minGQ=1900(inclusive),maxGQ=2000(exclusive) | ||
##GVCFBlock2000-2147483647=minGQ=2000(inclusive),maxGQ=2147483647(exclusive) | ||
##GVCFBlock500-1000=minGQ=500(inclusive),maxGQ=1000(exclusive) | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=ECNT,Number=1,Type=Integer,Description="Number of events in this haplotype"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=IN_PON,Number=0,Type=Flag,Description="site found in panel of normals"> | ||
##INFO=<ID=NLOD,Number=A,Type=Float,Description="Normal LOD score"> | ||
##INFO=<ID=N_ART_LOD,Number=A,Type=Float,Description="log odds of artifact in normal with same allele fraction as tumor"> | ||
##INFO=<ID=POP_AF,Number=A,Type=Float,Description="population allele frequencies of alt alleles"> | ||
##INFO=<ID=P_CONTAM,Number=1,Type=Float,Description="Posterior probability for alt allele to be due to contamination"> | ||
##INFO=<ID=P_GERMLINE,Number=A,Type=Float,Description="Posterior probability for alt allele to be germline variants"> | ||
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)"> | ||
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)"> | ||
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat"> | ||
##INFO=<ID=TLOD,Number=A,Type=Float,Description="Tumor LOD score"> | ||
##MutectVersion=2.1 | ||
##contig=<ID=chrM,length=16569> | ||
##contig=<ID=chr1,length=248956422> | ||
##contig=<ID=chr2,length=242193529> | ||
##filtering_status=Warning: unfiltered Mutect 2 calls. Please run FilterMutectCalls to remove false positives. | ||
##source=Mutect2 | ||
##tumor_sample=NA12878 | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 | ||
chrM 1 . G <NON_REF> . . END=156 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chrM 152 . T C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943 | ||
chrM 153 . A <NON_REF> . . END=16569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr1 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr1 99993570 . A <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr1 99993560 . C <NON_REF> . . END=99993570 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr1 99993571 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943 | ||
chr1 99993572 . A <NON_REF> . . END=248956422 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr2 1 . N <NON_REF> . . END=99993569 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr2 59984617 . A <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr2 59984617 . C <NON_REF> . . END=59984617 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 | ||
chr2 59984618 . A C,<NON_REF> . . DP=1582;ECNT=2;POP_AF=5.000e-08,5.000e-08;TLOD=5265.15,-2.894e+00 GT:AD:AF:DP:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:PGT:PID:SA_MAP_AF:SA_POST_PROB 0/1/2:3,1556,0:0.997,6.372e-04:1559:2,777,0:1,779,0:30,30,0:16270,369,0:60,0:42,0:0|1:152_T_C:0.990,0.990,0.998:0.036,0.021,0.943 | ||
chr2 59984619 . A <NON_REF> . . END=242193529 GT:DP:MIN_DP:TLOD 0/0:1694:1680:1803.51 |
Oops, something went wrong.