From f3ad960fa36e263684fd0822a14cb72e8f1b2d5c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 May 2023 16:02:52 +0100 Subject: [PATCH] Don't create overly large CRAM blocks. Currently CRAM containers can in some circumstance become huge. To prevent this we currently have a limit of the number of sequences (default 10,000) and also by number of bases (default 500 * number of seqs) so long-read technologies don't put too much in a container. However if we have 10k of reads with jointly under 5Mb of sequence that also have over 2GB worth of aux data, then we can trigger the overflow fixed in the previous commit. How do we get >430 bytes worth of aux for every base and >214Kb of aux for every read, in real world data rather than in deliberate stress testing? One possibility is with SEQ "*" (eg secondary alignments from minimap2) on very long-read data with heavy aux tag usage, as this doesn't increase base count at all. The same issue occurs to a lesser extent which supplementaries and hard-clipping. We now create new containers when seq+aux goes beyond the specified limit instead of just seq. In normal circumstances this will have a limited effect. Thanks to Martin Pollard for triggering and reporting this corner case. --- cram/cram_encode.c | 5 +++-- cram/cram_structs.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 5b56aedd5..9797fa7a8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3852,7 +3852,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || - (c->s_num_bases >= fd->bases_per_slice)) { + (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) { int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; int curr_ref = c->slice ? c->curr_ref : bam_ref(b); @@ -3885,7 +3885,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (CRAM_MAJOR_VERS(fd->version) == 1 || c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice || - c->s_num_bases >= fd->bases_per_slice) { + c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush @@ -3997,6 +3997,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->curr_rec++; c->curr_c_rec++; c->s_num_bases += bam_seq_len(b); + c->s_aux_bytes += bam_get_l_aux(b); c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1; fd->record_counter++; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 8b21d29c0..15b7f145b 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -473,6 +473,7 @@ struct cram_container { uint32_t crc32; // CRC32 uint64_t s_num_bases; // number of bases in this slice + uint64_t s_aux_bytes; // number of bytes of aux in BAM uint32_t n_mapped; // Number of mapped reads int ref_free; // whether 'ref' is owned by us and must be freed.