Skip to content

Commit

Permalink
Add support for non-standard chromosome names containing [:-] characters
Browse files Browse the repository at this point in the history
Note that hts_parse_region() cannot be used here because it requires the
header; without the header, the caller does not learn the contig name.

Resolves samtools#1620
  • Loading branch information
pd3 authored and vasudeva8 committed Aug 17, 2023
1 parent 857112a commit de19711
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 5 deletions.
2 changes: 2 additions & 0 deletions htslib/synced_bcf_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file);
* supply 'from' in place of 'to'. When 'to' is negative, first
* abs(to) will be attempted and if that fails, 'from' will be used
* instead.
* If the chromosome name contains the characters ':' or '-', it should
* be enclosed in curly brackets, for example "{weird-chr-name:1-2}:1000-2000".
*
* The bcf_sr_regions_t struct returned by a successful call should be freed
* via bcf_sr_regions_destroy() when it is no longer needed.
Expand Down
32 changes: 27 additions & 5 deletions synced_bcf_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
}

// File name or a list of genomic locations. If file name, NULL is returned.
// Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos.
// hts_parse_region() cannot be used here because it requires the header; if the header
// is not present, the caller would not learn the chromosome name.
static bcf_sr_regions_t *_regions_init_string(const char *str)
{
bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
Expand All @@ -1043,17 +1046,31 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
hts_pos_t from, to;
while ( 1 )
{
while ( *ep && *ep!=',' && *ep!=':' ) ep++;
tmp.l = 0;
kputsn(sp,ep-sp,&tmp);
if ( *ep=='{' )
{
while ( *ep && *ep!='}' ) ep++;
if ( !*ep )
{
hts_log_error("Could not parse the region, mismatching braces in: \"%s\"", str);
goto exit_nicely;
}
ep++;
kputsn(sp+1,ep-sp-2,&tmp);
}
else
{
while ( *ep && *ep!=',' && *ep!=':' ) ep++;
kputsn(sp,ep-sp,&tmp);
}
if ( *ep==':' )
{
sp = ep+1;
from = hts_parse_decimal(sp,(char**)&ep,0);
if ( sp==ep )
{
hts_log_error("Could not parse the region(s): %s", str);
free(reg); free(tmp.s); return NULL;
goto exit_nicely;
}
if ( !*ep || *ep==',' )
{
Expand All @@ -1064,15 +1081,15 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
if ( *ep!='-' )
{
hts_log_error("Could not parse the region(s): %s", str);
free(reg); free(tmp.s); return NULL;
goto exit_nicely;
}
ep++;
sp = ep;
to = hts_parse_decimal(sp,(char**)&ep,0);
if ( *ep && *ep!=',' )
{
hts_log_error("Could not parse the region(s): %s", str);
free(reg); free(tmp.s); return NULL;
goto exit_nicely;
}
if ( sp==ep ) to = MAX_CSI_COOR-1;
_regions_add(reg, tmp.s, from, to);
Expand All @@ -1088,6 +1105,11 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
}
free(tmp.s);
return reg;

exit_nicely:
bcf_sr_regions_destroy(reg);
free(tmp.s);
return NULL;
}

// ichr,ifrom,ito are 0-based;
Expand Down

0 comments on commit de19711

Please sign in to comment.