-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle trailing reference blocks in CombineGVCFs. #4615
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,7 +74,10 @@ public Object[][] gvcfsToCombine() { | |
//Testing allele-specific annotations | ||
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotations.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21}, | ||
//Testing allele-specific annotations missing AS_Standard Group | ||
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},}; | ||
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21}, | ||
//Test that trailing reference blocks are emitted with correct intervals | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could demonstrate the issue i'm talking about if you combined two files that are already banded in step with eachother and you set BREAK_BANDS_LONG_NAME to some short value. you would expect a gap at the end of the contig given your change. |
||
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")}, getTestFile("gvcfWithTrailingReferenceBlocksExpected.g.vcf"), NO_EXTRA_ARGS, b38_reference_20_21}, | ||
}; | ||
} | ||
|
||
|
||
|
@@ -118,11 +121,6 @@ public void compareToGATK3(File[] inputs, File outputFile, List<String> extraArg | |
} else { | ||
System.out.println("Found precomputed gatk3Result"); | ||
} | ||
final Path expectedResultsDir = Paths.get(getToolTestDataDir()); | ||
if ( !Files.exists(expectedResultsDir)) { | ||
Files.createDirectory(expectedResultsDir); | ||
} | ||
Files.copy(gatk3Result.toPath(), expectedResultsDir.resolve(outputFile.getName()), StandardCopyOption.REPLACE_EXISTING); | ||
|
||
assertVariantContextsMatch(Arrays.asList(inputs), gatk3Result, extraArgs, reference); | ||
} | ||
|
@@ -151,7 +149,7 @@ private void assertVariantContextsMatch(List<File> inputs, File expected, List<S | |
} | ||
|
||
public void runCombineGVCFSandAssertSomething(List<File> inputs, File expected, List<String> additionalArguments, BiConsumer<VariantContext, VariantContext> assertion, String reference) throws IOException { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(reference)) | ||
|
@@ -195,7 +193,7 @@ private static VCFHeader getHeaderFromFile(final File vcfFile) throws IOExceptio | |
|
||
@Test | ||
public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -227,7 +225,7 @@ public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception { | |
|
||
@Test() | ||
public void testTetraploidRun() throws IOException { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -248,7 +246,7 @@ public void testTetraploidRun() throws IOException { | |
|
||
@Test | ||
public void testTwoSpansManyBlocksInOne() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -267,7 +265,7 @@ public void testTwoSpansManyBlocksInOne() throws Exception { | |
// Ensuring that no exception is thrown and that the resulting VCF is empty | ||
@Test | ||
public void testNoDataInInterval() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -286,7 +284,7 @@ public void testNoDataInInterval() throws Exception { | |
|
||
@Test | ||
public void testOneHasAltAndTwoHasNothing() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -307,7 +305,7 @@ public void testOneHasAltAndTwoHasNothing() throws Exception { | |
|
||
@Test | ||
public void testOneHasAltAndTwoHasRefBlock() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -332,7 +330,7 @@ public void testOneHasAltAndTwoHasRefBlock() throws Exception { | |
|
||
@Test | ||
public void testOneHasDeletionAndTwoHasRefBlock() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
@@ -369,7 +367,7 @@ public void testOneHasDeletionAndTwoHasRefBlock() throws Exception { | |
// This test asserts that the behavior is rational in the case of whole genome gvcfs which have variants starting at | ||
// the first and ending on the last base of each contig. | ||
public void testStartChromosome() throws Exception { | ||
final File output = createTempFile("genotypegvcf", ".vcf"); | ||
final File output = createTempFile("combinegvcfs", ".vcf"); | ||
|
||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addReference(new File(b37_reference_20_21)) | ||
|
47 changes: 47 additions & 0 deletions
47
...itute/hellbender/tools/walkers/CombineGVCFs/gvcfExample1WithTrailingReferenceBlocks.g.vcf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
##fileformat=VCFv4.1 | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive) | ||
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##contig=<ID=chr20,length=64444167> | ||
##contig=<ID=chr21,length=46709983> | ||
##contig=<ID=chr20_GL383577v2_alt,length=128386> | ||
##contig=<ID=chr20_KI270869v1_alt,length=118774> | ||
##contig=<ID=chr20_KI270871v1_alt,length=58661> | ||
##contig=<ID=chr20_KI270870v1_alt,length= 183433> | ||
##contig=<ID=chr21_GL383578v2_alt,length=63917> | ||
##contig=<ID=chr21_KI270874v1_alt,length= 166743> | ||
##contig=<ID=chr21_KI270873v1_alt,length=143900> | ||
##contig=<ID=chr21_GL383579v2_alt,length=201197> | ||
##contig=<ID=chr21_GL383580v2_alt,length=74653> | ||
##contig=<ID=chr21_GL383581v2_alt,length=116689> | ||
##contig=<ID=chr21_KI270872v1_alt,length=82692> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 | ||
chr20 69491 . G <NON_REF> . . BLOCK_SIZE=20;END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800 | ||
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33 | ||
chr20 69512 . C <NON_REF> . . BLOCK_SIZE=10;END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800 | ||
chr20 69522 . T <NON_REF> . . BLOCK_SIZE=27;END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0 | ||
chr20 69549 . G <NON_REF> . . BLOCK_SIZE=86;END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:156:99:56:66:0,66,990 | ||
chr20 69635 . G T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3 | ||
chr20 69762 . T <NON_REF> . . BLOCK_SIZE=1;END=69762 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270 | ||
chr20 69763 . A <NON_REF> . . BLOCK_SIZE=4;END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253 | ||
chr20 69767 . C <NON_REF> . . BLOCK_SIZE=5;END=69771 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180 | ||
chr20 69772 . AAAGC A,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3 | ||
chr20 69773 . A <NON_REF> . . BLOCK_SIZE=10;END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0 | ||
chr21 1 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0 | ||
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0 |
41 changes: 41 additions & 0 deletions
41
...itute/hellbender/tools/walkers/CombineGVCFs/gvcfExample2WithTrailingReferenceBlocks.g.vcf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
##fileformat=VCFv4.1 | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive) | ||
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##contig=<ID=chr20,length=64444167> | ||
##contig=<ID=chr21,length=46709983> | ||
##contig=<ID=chr20_GL383577v2_alt,length=128386> | ||
##contig=<ID=chr20_KI270869v1_alt,length=118774> | ||
##contig=<ID=chr20_KI270871v1_alt,length=58661> | ||
##contig=<ID=chr20_KI270870v1_alt,length= 183433> | ||
##contig=<ID=chr21_GL383578v2_alt,length=63917> | ||
##contig=<ID=chr21_KI270874v1_alt,length= 166743> | ||
##contig=<ID=chr21_KI270873v1_alt,length=143900> | ||
##contig=<ID=chr21_GL383579v2_alt,length=201197> | ||
##contig=<ID=chr21_GL383580v2_alt,length=74653> | ||
##contig=<ID=chr21_GL383581v2_alt,length=116689> | ||
##contig=<ID=chr21_KI270872v1_alt,length=82692> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA2 | ||
chr20 69498 . C <NON_REF> . . BLOCK_SIZE=9;END=69506 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800 | ||
chr20 69514 . T <NON_REF> . . BLOCK_SIZE=111;END=69624 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800 | ||
chr20 69625 . A <NON_REF> . . BLOCK_SIZE=51;END=69675 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270 | ||
chr20 69767 . C <NON_REF> . . BLOCK_SIZE=8;END=69774 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:15:0,12,180 | ||
chr20 69775 . C <NON_REF> . . BLOCK_SIZE=17;END=69791 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:15:7:15:0,15,160 | ||
chr21 1 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0 | ||
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0 |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm... So this change is mostly agnostic and is definately important for the case where the assumption that the prevPos must be on the right contig (which wasn't checked before). I'm not convinced this works universally in the case where there is a gap before the last contig. So imagine there are still reads queued to be closed at prevPos, a gap in coverage, then a stack of variant contexts at the last position. It doesn't strike me that this should handle closing that last interval if it needs to generate its own stops (for example if BREAK_BANDS_LONG_NAME) is set then it may well generate breaks in the gap between prevpos and the last stack of VCs. I think we should prefer to use prevPos in these cases and only switch to variantContextsOverlappingCurrentMerge.get(0) in the case where prevpos is on the wrong contig/starts later than it should.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jamesemery Having thought about this a bit more, I'm not sure that problem is new to this change. As far as I can tell, this implementation of the tool only emits coverage for territory that is covered by the inputs, and doesn't pad out territory beyond that. In onTraversalSuccess, if prevPos is on a different contig than the remaining variants to be merged, we won't close out (pad out) the previous contig, but I don't think the tool ever does that ?). We might want to fix that, but I think thats a separate/bigger issue ? Otherwise, the variants that are remaining are all guaranteed to be on the same contig (right ?) Sorry if I'm missing something, but please push back if you think I am.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You know, you are right. I was thinking of the code being like a normal
apply()
call where you are holding onto something "next." Your fix looks good.