Skip to content

Commit

Permalink
Fix in gvcf merging
Browse files Browse the repository at this point in the history
and prevent a an infinite loop when a variant record and a gVCF block start at the same position.
Fixes #1164
  • Loading branch information
pd3 committed Mar 5, 2020
1 parent 9d66868 commit 2de40a8
Show file tree
Hide file tree
Showing 15 changed files with 115 additions and 32 deletions.
6 changes: 6 additions & 0 deletions test/merge.gvcf.10.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A <*>,C . . END=2
7 changes: 7 additions & 0 deletions test/merge.gvcf.10.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A <*> . . END=2
chr1 1 . A C . . .
7 changes: 7 additions & 0 deletions test/merge.gvcf.10.3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A <*>,C . . .
chr1 2 . C <*> . . .
7 changes: 7 additions & 0 deletions test/merge.gvcf.10.4.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A C . . .
chr1 2 . C <*> . . .
7 changes: 7 additions & 0 deletions test/merge.gvcf.10.5.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A C,<*> . . .
chr1 2 . C <*> . . .
6 changes: 6 additions & 0 deletions test/merge.gvcf.10.a.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A <*> . . END=2
5 changes: 5 additions & 0 deletions test/merge.gvcf.10.b.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 1 . A C . . .
2 changes: 2 additions & 0 deletions test/merge.gvcf.10.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr1
ACGT
1 change: 1 addition & 0 deletions test/merge.gvcf.10.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr1 4 6 4 5
2 changes: 2 additions & 0 deletions test/merge.gvcf.10.ref
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr1
ACGT
8 changes: 4 additions & 4 deletions test/merge.gvcf.2.a.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
4 20001071 . T G,<*> 0 . . PL:DP:DV 0,4,10,20,30,40:4:1
5 110285 . TAACCCC T . . . PL 89,6,0
5 1110285 . T TAACCCC . . . PL 89,6,0
6 600 . T A . . END=666 PL 66,1,1
7 701 . T A . . END=702 PL 77,1,1
7 703 . T A . . END=777 PL 77,1,2
8 1 . T A . . END=10 PL 88,1,1
6 600 . T <*> . . END=666 PL 66,1,1
7 701 . T <*> . . END=702 PL 77,1,1
7 703 . T <*> . . END=777 PL 77,1,2
8 1 . T <*> . . END=10 PL 88,1,1
10 changes: 5 additions & 5 deletions test/merge.gvcf.2.b.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
4 20001071 . T G,<*> 0 . QS=0.75,0.25,0 PL:DP:DV 0,4,10,35,73,113:4:1
5 110285 . T C,<*> . . . PL 114,0,15,35,73,113
5 1110285 . T C,<*> . . . PL 114,0,15,35,73,113
6 610 . T A . . . PL 66,2,1
6 620 . T A . . END=625 PL 66,2,2
6 630 . T A . . . PL 66,2,3
7 701 . T A . . . PL 77,2,1
7 702 . T A . . END=703 PL 77,2,2
6 610 . T <*> . . . PL 66,2,1
6 620 . T <*> . . END=625 PL 66,2,2
6 630 . T <*> . . . PL 66,2,3
7 701 . T <*> . . . PL 77,2,1
7 702 . T <*> . . END=703 PL 77,2,2
29 changes: 14 additions & 15 deletions test/merge.gvcf.2.out
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,17 @@
5 110285 . TAACCCC T . . . PL 89,6,0 . .
5 1110285 . T C,<*> . . . PL . 114,0,15,35,73,113 .
5 1110285 . T TAACCCC . . . PL 89,6,0 . .
6 600 . T A . . END=609 PL 66,1,1 . .
6 610 . T A . . . PL 66,1,1 66,2,1 .
6 611 . N A . . END=619 PL 66,1,1 . .
6 620 . T A . . END=625 PL 66,1,1 66,2,2 .
6 626 . N A . . END=629 PL 66,1,1 . .
6 630 . T A . . . PL 66,1,1 66,2,3 .
6 631 . N A . . END=666 PL 66,1,1 . .
7 701 . T A . . . PL 77,1,1 77,2,1 .
7 702 . T A . . . PL . 77,2,2 .
7 703 . T A . . . PL 77,1,2 . .
7 704 . N A . . END=777 PL 77,1,2 . .
8 1 . T A . . END=2 PL 88,1,1 . .
8 3 . T A . . END=5 PL 88,1,1 88,2,1 88,3,1
8 6 . N A . . END=7 PL 88,1,1 88,2,1 .
8 8 . N A . . END=10 PL 88,1,1 . .
6 600 . T <*> . . END=609 PL 66,1,1 . .
6 610 . T <*> . . . PL 66,1,1 66,2,1 .
6 611 . N <*> . . END=619 PL 66,1,1 . .
6 620 . T <*> . . END=625 PL 66,1,1 66,2,2 .
6 626 . N <*> . . END=629 PL 66,1,1 . .
6 630 . T <*> . . . PL 66,1,1 66,2,3 .
6 631 . N <*> . . END=666 PL 66,1,1 . .
7 701 . T <*> . . . PL 77,1,1 77,2,1 .
7 702 . T <*> . . . PL . 77,2,2 .
7 703 . T <*> . . . PL 77,1,2 . .
7 704 . N <*> . . END=777 PL 77,1,2 . .
8 1 . T <*> . . END=2 PL 88,1,1 . .
8 3 . T <*>,A . . . PL 88,1,1,.,.,. 88,.,.,2,.,1 88,.,.,3,.,1
8 4 . N <*> . . END=10 PL 88,1,1 . .
6 changes: 6 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@
test_vcf_merge($opts,in=>['merge.gvcf.9a','merge.gvcf.9b','merge.gvcf.9c','merge.gvcf.9d'],out=>'merge.gvcf.9.2.out',args=>'--gvcf {PATH}/gvcf.fa -r 22:21-23');
test_vcf_merge($opts,in=>['merge.gvcf.9a','merge.gvcf.9b','merge.gvcf.9c','merge.gvcf.9d','merge.gvcf.9e'],out=>'merge.gvcf.9.3.out',args=>'--gvcf {PATH}/gvcf.fa');
test_vcf_merge($opts,in=>['merge.gvcf.9a','merge.gvcf.9b','merge.gvcf.9c','merge.gvcf.9d','merge.gvcf.9e'],out=>'merge.gvcf.9.4.out',args=>'--gvcf {PATH}/gvcf.fa -r 22:21-23');
test_vcf_merge($opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.1.out',args=>'');
test_vcf_merge($opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.2.out',args=>'-m none');
test_vcf_merge($opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.3.out',args=>'-g {PATH}/merge.gvcf.10.fa');
test_vcf_merge($opts,in=>['merge.gvcf.10.a','merge.gvcf.10.b'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none');
test_vcf_merge($opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.5.out',args=>'-g {PATH}/merge.gvcf.10.fa');
test_vcf_merge($opts,in=>['merge.gvcf.10.b','merge.gvcf.10.a'],out=>'merge.gvcf.10.4.out',args=>'-g {PATH}/merge.gvcf.10.fa -m none');
test_vcf_query($opts,in=>'query',out=>'query.out',args=>q[-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%DP4\\t%AN[\\t%GT\\t%TGT]\\n']);
test_vcf_query($opts,in=>'query.variantkey',out=>'query.variantkey.hex.out',args=>q[-f '%RSX\\t%VKX\\n']);
test_vcf_query($opts,in=>'view.filter',out=>'query.2.out',args=>q[-f'%XRI\\n' -i'XRI[*]>1111']);
Expand Down
44 changes: 36 additions & 8 deletions vcfmerge.c
Original file line number Diff line number Diff line change
Expand Up @@ -2042,6 +2042,23 @@ void gvcf_flush(args_t *args, int done)
}
}

static inline int is_gvcf_block(bcf1_t *line)
{
if ( line->rlen<=1 ) return 0;
if ( strlen(line->d.allele[0])==line->rlen ) return 0;
if ( line->n_allele==1 ) return 1;

int i;
for (i=1; i<line->n_allele; i++)
{
if ( !strcmp(line->d.allele[i],"<*>") ) return 1;
if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return 1;
if ( !strcmp(line->d.allele[i],"<X>") ) return 1;
}
return 0;
}
static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;

/*
Check incoming lines for new gVCF blocks, set pointer to the current source
buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
Expand Down Expand Up @@ -2079,7 +2096,7 @@ void gvcf_stage(args_t *args, int pos)
int irec = maux->buf[i].beg;
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
bcf1_t *line = args->files->readers[i].buffer[irec];
int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
if ( ret==1 )
{
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
Expand Down Expand Up @@ -2220,7 +2237,6 @@ void debug_state(args_t *args)
fprintf(stderr,"\n");
}


/*
Determine which line should be merged from which reader: go through all
readers and all buffered lines, expand REF,ALT and try to match lines with
Expand All @@ -2229,7 +2245,6 @@ void debug_state(args_t *args)
int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
Expand All @@ -2242,6 +2257,9 @@ int can_merge(args_t *args)
}
maux->var_types = maux->nals = 0;

// this is only for the `-m none -g` mode, ensure that <*> lines come last
#define VCF_GVCF_REF 1

for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
Expand All @@ -2259,12 +2277,17 @@ int can_merge(args_t *args)
buf->rec[j].skip = SKIP_DIFF;
ntodo++;

bcf1_t *line = buf->lines[j];
if ( args->merge_by_id )
id = buf->lines[j]->d.id;
id = line->d.id;
else
{
int var_type = bcf_get_variant_types(buf->lines[j]);
maux->var_types |= var_type ? var_type<<1 : 1;
int var_type = bcf_get_variant_types(line);
maux->var_types |= var_type ? var_type<<2 : 2;

// for the `-m none -g` mode
if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
maux->var_types |= VCF_GVCF_REF;
}
}

Expand Down Expand Up @@ -2296,7 +2319,7 @@ int can_merge(args_t *args)
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer

int line_type = bcf_get_variant_types(line);
line_type = line_type ? line_type<<1 : 1;
line_type = line_type ? line_type<<2 : 2;

// select relevant lines
if ( args->merge_by_id )
Expand All @@ -2305,6 +2328,12 @@ int can_merge(args_t *args)
}
else
{
// when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
// records come last, otherwise infinite loop is created (#1164)
if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
{
if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
}
if ( args->collapse==COLLAPSE_NONE && maux->nals )
{
// All alleles of the tested record must be present in the
Expand Down Expand Up @@ -2368,7 +2397,6 @@ int can_merge(args_t *args)
*/
void stage_line(args_t *args)
{
int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;

Expand Down

0 comments on commit 2de40a8

Please sign in to comment.