Skip to content

Commit

Permalink
Fix cram_index_query_last function
Browse files Browse the repository at this point in the history
The cram_index_query_last used by sam_itr_regarray had problems when
dealing with slices whose region is contained within another region.

Fundamentally this is due to the cram_index arrays being contiguous in
memory until a containment is found, at which point the pointers will
be to an entirely different array.  This breaks naive pointer
comparisons.

The cram_index struct already had a "next" field holding the file
offset of the next container.  This has been replaced by e_next
pointing to the next cram_entry struct in file ordering, and
e_next->offset is equivalent to the old "next".  This allows
consumption of the index either as the original nested containment
list or as a traditional linked list.

Also fixed cram_index_query with from != NULL, which similarly was
incorrect before.  We never used this function and it's not public,
but we now use it within the rewrite of cram_index_query_last.

Fixes samtools#1569
  • Loading branch information
jkbonfield committed Mar 1, 2023
1 parent 0ad23b0 commit 6184206
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 48 deletions.
116 changes: 71 additions & 45 deletions cram/cram_index.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void dump_index_(cram_index *e, int level) {
int i, n;
n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next);
for (i = 0; i < e->nslice; i++) {
dump_index_(&e->e[i], level+1);
}
Expand All @@ -86,6 +86,38 @@ static void dump_index(cram_fd *fd) {
}
#endif

// Thread a linked list through the nested containment list.
// This makes navigating it and finding the "next" index entry
// trivial.
static cram_index *link_index_(cram_index *e, cram_index *e_last) {
int i;
if (e_last)
e_last->e_next = e;

// We don't want to link in the top-level cram_index with
// offset=0 and start/end = INT_MIN/INT_MAX.
if (e->offset)
e_last = e;

for (i = 0; i < e->nslice; i++)
e_last = link_index_(&e->e[i], e_last);

return e_last;
return e->nslice ? &e->e[e->nslice-1] : e;
}

static void link_index(cram_fd *fd) {
int i;
cram_index *e_last = NULL;

for (i = 0; i < fd->index_sz; i++) {
e_last = link_index_(&fd->index[i], e_last);
}

if (e_last)
e_last->e_next = NULL;
}

static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
int sign = 1;
int32_t val = 0;
Expand Down Expand Up @@ -313,7 +345,10 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
free(kstr.s);
free(tfn_idx);

// dump_index(fd);
// Convert NCList to linear linked list
link_index(fd);

//dump_index(fd);

return 0;

Expand Down Expand Up @@ -356,7 +391,7 @@ void cram_index_free(cram_fd *fd) {
* entries, but we require at least one per reference.)
*
* If the index finds multiple slices overlapping this position we
* return the first one only. Subsequent calls should specifying
* return the first one only. Subsequent calls should specify
* "from" as the last slice we checked to find the next one. Otherwise
* set "from" to be NULL to find the first one.
*
Expand All @@ -371,6 +406,17 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
int i, j, k;
cram_index *e;

if (from) {
// Continue from a previous search.
// We switch to just scanning the linked list, as the nested
// lists are typically short.
e = from->e_next;
if (e && e->refid == refid && e->start <= pos)
return e;
else
return NULL;
}

switch(refid) {
case HTS_IDX_NONE:
case HTS_IDX_REST:
Expand Down Expand Up @@ -400,8 +446,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
return NULL;
}

if (!from)
from = &fd->index[refid+1];
from = &fd->index[refid+1];

// Ref with nothing aligned against it.
if (!from->e)
Expand Down Expand Up @@ -469,52 +514,33 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
return &from->e[slice];
}

/*
* Find the last container overlapping pos 'end', and the file offset of
* its end (equivalent to the start offset of the container following it).
*/
cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
cram_index *first = cram_index_query(fd, refid, end, NULL);
cram_index *last = cram_index_last(fd, refid, NULL);
if (!first || !last)
return NULL;

while (first < last && (first+1)->start <= end)
first++;
cram_index *e = NULL, *prev_e;
do {
prev_e = e;
e = cram_index_query(fd, refid, end, prev_e);
} while (e);

while (first->e) {
int count = 0;
int nslices = first->nslice;
first = first->e;
while (++count < nslices && (first+1)->start <= end)
first++;
}
if (!prev_e)
return NULL;
e = prev_e;

// Compute the start location of next container.
//
// This is useful for stitching containers together in the multi-region
// iterator. Sadly we can't compute this from the single index line.
// Note: offset of e and e->e_next may be the same if we're using a
// multi-ref container where a single container generates multiple
// index entries.
//
// Note we can have neighbouring index entries at the same location
// for when we have multi-reference mode and/or multiple slices per
// container.
cram_index *next = first;
// We need to keep iterating until offset differs in order to find
// the genuine file offset for the end of container.
do {
if (next >= last) {
// Next non-empty reference
while (++refid+1 < fd->index_sz)
if (fd->index[refid+1].nslice)
break;
if (refid+1 >= fd->index_sz) {
next = NULL;
} else {
next = fd->index[refid+1].e;
last = fd->index[refid+1].e + fd->index[refid+1].nslice;
}
} else {
next++;
}
} while (next && next->offset == first->offset);

first->next = next ? next->offset : 0;
prev_e = e;
e = e->e_next;
} while (e && e->offset == prev_e->offset);

return first;
return prev_e;
}

/*
Expand Down
5 changes: 4 additions & 1 deletion cram/cram_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,10 @@ typedef struct cram_index {
int slice; // 1.0 landmark index, 1.1 landmark value
int len; // 1.1 - size of slice in bytes
int64_t offset; // 1.0 1.1
int64_t next; // derived: offset of next container.

// Linked list of cram_index entries. Used to convert recursive
// NCList back to a linear list.
struct cram_index *e_next;
} cram_index;

typedef struct {
Expand Down
4 changes: 2 additions & 2 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -3408,8 +3408,8 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
}

if (e) {
off[n_off++].v = e->next
? e->next
off[n_off++].v = e->e_next
? e->e_next->offset
: e->offset + e->slice + e->len;
} else {
hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
Expand Down

0 comments on commit 6184206

Please sign in to comment.