From dcb7d37635c008402dc31a7415d562b1928bae22 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 28 Feb 2023 13:53:29 +0000 Subject: [PATCH] Fix cram_index_query_last function The cram_index_query_last used by sam_itr_regarray had problems when dealing with slices whose region is contained within another region. Fundamentally this is due to the cram_index arrays being contiguous in memory until a containment is found, at which point the pointers will be to an entirely different array. This breaks naive pointer comparisons. The cram_index struct already had a "next" field holding the file offset of the next container. This has been replaced by e_next pointing to the next cram_entry struct in file ordering, and e_next->offset is equivalent to the old "next". This allows consumption of the index either as the original nested containment list or as a traditional linked list. Also fixed cram_index_query with from != NULL, which similarly was incorrect before. We never used this function and it's not public, but we now use it within the rewrite of cram_index_query_last. Fixes #1569 --- cram/cram_index.c | 110 ++++++++++++++++++++++++++------------------ cram/cram_structs.h | 5 +- hts.c | 4 +- 3 files changed, 72 insertions(+), 47 deletions(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 45d420df2..24bcf09db 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -86,6 +86,34 @@ static void dump_index(cram_fd *fd) { } #endif +// Thread a linked list through the nested containment list. +// This makes navigating it and finding the "next" index entry +// trivial. +static cram_index *link_index_(cram_index *e, cram_index *e_last) { + int i; + if (e_last) + e_last->e_next = e; + + e_last = e; + for (i = 0; i < e->nslice; i++) + e_last = link_index_(&e->e[i], e_last); + + return e_last; + return e->nslice ? &e->e[e->nslice-1] : e; +} + +static void link_index(cram_fd *fd) { + int i; + cram_index *e_last = NULL; + + for (i = 0; i < fd->index_sz; i++) { + e_last = link_index_(&fd->index[i], e_last); + } + + if (e_last) + e_last->e_next = NULL; +} + static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { int sign = 1; int32_t val = 0; @@ -313,7 +341,10 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { free(kstr.s); free(tfn_idx); - // dump_index(fd); + // Convert NCList to linear linked list + link_index(fd); + + //dump_index(fd); return 0; @@ -356,7 +387,7 @@ void cram_index_free(cram_fd *fd) { * entries, but we require at least one per reference.) * * If the index finds multiple slices overlapping this position we - * return the first one only. Subsequent calls should specifying + * return the first one only. Subsequent calls should specify * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * @@ -371,6 +402,19 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, int i, j, k; cram_index *e; + if (from) { + // Continue from a previous search. + // We switch to just scanning the linked list, as the nested + // lists are typically short. + e = from->e_next; + while (e && e->refid == refid && e->start <= pos) { + if (e->end >= pos) + return e; + e = e->e_next; + } + return NULL; + } + switch(refid) { case HTS_IDX_NONE: case HTS_IDX_REST: @@ -400,8 +444,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, return NULL; } - if (!from) - from = &fd->index[refid+1]; + from = &fd->index[refid+1]; // Ref with nothing aligned against it. if (!from->e) @@ -469,52 +512,31 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { return &from->e[slice]; } +/* + * Find the last container overlapping pos 'end', and the file offset of + * its end (equivalent to the start offset of the container following it). + */ cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { - cram_index *first = cram_index_query(fd, refid, end, NULL); - cram_index *last = cram_index_last(fd, refid, NULL); - if (!first || !last) + cram_index *e = NULL, *prev_e; + do { + prev_e = e; + e = cram_index_query(fd, refid, end, prev_e); + } while (e); + if (!prev_e) return NULL; - while (first < last && (first+1)->start <= end) - first++; - - while (first->e) { - int count = 0; - int nslices = first->nslice; - first = first->e; - while (++count < nslices && (first+1)->start <= end) - first++; - } - - // Compute the start location of next container. - // - // This is useful for stitching containers together in the multi-region - // iterator. Sadly we can't compute this from the single index line. - // - // Note we can have neighbouring index entries at the same location - // for when we have multi-reference mode and/or multiple slices per + // Note: offset of e and e->e_next may be the same if we're using a + // multi-ref container. We need to keep iterating until offset + // differs in order to find the genuine file offset for the end of // container. - cram_index *next = first; + e = prev_e; do { - if (next >= last) { - // Next non-empty reference - while (++refid+1 < fd->index_sz) - if (fd->index[refid+1].nslice) - break; - if (refid+1 >= fd->index_sz) { - next = NULL; - } else { - next = fd->index[refid+1].e; - last = fd->index[refid+1].e + fd->index[refid+1].nslice; - } - } else { - next++; - } - } while (next && next->offset == first->offset); - - first->next = next ? next->offset : 0; + prev_e = e; + e = e->e_next; + } while (e && e->offset == prev_e->offset); - return first; + //prev_e->next = prev_e->e_next ? prev_e->e_next->offset : 0; + return prev_e; } /* diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 1ee4b9e85..0a66d51b9 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -725,7 +725,10 @@ typedef struct cram_index { int slice; // 1.0 landmark index, 1.1 landmark value int len; // 1.1 - size of slice in bytes int64_t offset; // 1.0 1.1 - int64_t next; // derived: offset of next container. + + // Linked list of cram_index entries. Used to convert recursive + // NCList back to a linear list. + struct cram_index *e_next; } cram_index; typedef struct { diff --git a/hts.c b/hts.c index cead9d537..cb88160a4 100644 --- a/hts.c +++ b/hts.c @@ -3408,8 +3408,8 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) } if (e) { - off[n_off++].v = e->next - ? e->next + off[n_off++].v = e->e_next + ? e->e_next->offset : e->offset + e->slice + e->len; } else { hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);