diff --git a/recipes/bwa-mem2/Makefile.patch b/recipes/bwa-mem2/Makefile.patch index e72d7281c5076..e5225aeb2c42c 100644 --- a/recipes/bwa-mem2/Makefile.patch +++ b/recipes/bwa-mem2/Makefile.patch @@ -1,11 +1,2352 @@ ---- Source_code_including_submodules/bwa-mem2-2.1/Makefile 2020-10-16 17:24:18.085319255 +0100 -+++ Source_code_including_submodules/bwa-mem2-2.1/Makefile 2020-10-16 17:24:26.977342749 +0100 -@@ -43,7 +43,7 @@ +diff --git a/.gitmodules b/.gitmodules +index 2cdb3f2..a66a0f3 100644 +--- a/.gitmodules ++++ b/.gitmodules +@@ -1,3 +1,6 @@ + [submodule "ext/safestringlib"] + path = ext/safestringlib + url = https://github.com/intel/safestringlib.git ++[submodule "ext/simde"] ++ path = ext/simde ++ url = https://github.com/simd-everywhere/simde-no-tests.git +diff --git a/Makefile b/Makefile +index 359585f..ba7bc00 100644 +--- a/Makefile ++++ b/Makefile +@@ -42,7 +42,7 @@ endif + ARCH_FLAGS= -msse -msse2 -msse3 -mssse3 -msse4.1 MEM_FLAGS= -DSAIS=1 - CPPFLAGS+= -DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS) - INCLUDES= -Isrc -Iext/safestringlib/include + CPPFLAGS+= -DENABLE_PREFETCH -DV17=1 -DMATE_SORT=0 $(MEM_FLAGS) +-INCLUDES= -Isrc -Iext/safestringlib/include ++INCLUDES= -Isrc -Iext/safestringlib/include -Iext -LIBS= -lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring $(STATIC_GCC) +LIBS+= -lpthread -lm -lz -L. -lbwa -Lext/safestringlib -lsafestring $(STATIC_GCC) OBJS= src/fastmap.o src/bwtindex.o src/utils.o src/memcpy_bwamem.o src/kthread.o \ src/kstring.o src/ksw.o src/bntseq.o src/bwamem.o src/profiling.o src/bandedSWA.o \ - src/FMI_search.o src/read_index_ele.o src/bwamem_pair.o src/kswv.o src/bwa.o \ +diff --git a/ext/simde b/ext/simde +new file mode 160000 +index 0000000..6a1db3a +--- /dev/null ++++ b/ext/simde +@@ -0,0 +1 @@ ++Subproject commit 6a1db3a5a16e7ad04d05fc07d1f58da55f74766b +diff --git a/src/FMI_search.h b/src/FMI_search.h +index 25c4d0d..92ac03f 100644 +--- a/src/FMI_search.h ++++ b/src/FMI_search.h +@@ -34,7 +34,8 @@ Authors: Sanchit Misra ; Vasimuddin Md + #include + #include +-#include ++#define SIMDE_ENABLE_NATIVE_ALIASES ++#include + #include + #include + +diff --git a/src/bandedSWA.cpp b/src/bandedSWA.cpp +index dfd81bc..6bd26e6 100644 +--- a/src/bandedSWA.cpp ++++ b/src/bandedSWA.cpp +@@ -4146,16 +4146,6 @@ void BandedPairWiseSW::smithWaterman128_16(uint16_t seq1SoA[], + return; + } + +-/********************************************************************************/ +-/* SSE2 - 8 bit version */ +-#ifndef __SSE4_1__ +-static inline __m128i _mm_blendv_epi8 (__m128i x, __m128i y, __m128i mask) +-{ +- // Replace bit in x with bit in y when matching bit in mask is set: +- return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y)); +-} +-#endif +- + #define ZSCORE8(i4_128, y4_128) \ + { \ + __m128i tmpi = _mm_sub_epi8(i4_128, x128); \ +diff --git a/src/bandedSWA.h b/src/bandedSWA.h +index c81552b..1a9fd1f 100644 +--- a/src/bandedSWA.h ++++ b/src/bandedSWA.h +@@ -36,10 +36,17 @@ Authors: Vasimuddin Md ; Sanchit Misra + #include "macro.h" + ++#if !defined(__SSE__) ++#define _mm_malloc(size, align) aligned_alloc(align, size) ++#define _mm_free free ++#define _MM_HINT_NTA 0 ++#endif ++ ++#define SIMDE_ENABLE_NATIVE_ALIASES + #if (__AVX512BW__ || __AVX2__) + #include + #else +-#include // for SSE4.1 ++#include // for SSE4.1 + #define __mmask8 uint8_t + #define __mmask16 uint16_t + #endif +diff --git a/src/bwa.h b/src/bwa.h +index 877f00c..aa36083 100644 +--- a/src/bwa.h ++++ b/src/bwa.h +@@ -37,6 +37,13 @@ Authors: Vasimuddin Md ; Sanchit Misra (y)?(x):(y)) + #define min_(x, y) ((x)>(y)?(y):(x)) +- ++ + #define MAX_BAND_TRY 2 +- ++ + int tcnt = 0; + /******************** + * Filtering chains * +@@ -180,7 +180,7 @@ int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, + double r; + if (bns == 0 || pac == 0 || query == 0) return 0; + assert(a->rid == b->rid && a->rb <= b->rb); +- ++ + if (a->rb < bns->l_pac && b->rb >= bns->l_pac) return 0; // on different strands + if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear + w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth +@@ -193,32 +193,32 @@ int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, + "[%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n", + a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, + (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r); +- ++ + if (a->re < b->rb || a->qe < b->qb) // no overlap on query or on ref + { + if (w > opt->w<<1 || r >= PATCH_MAX_R_BW) return 0; // the bandwidth or the relative bandwidth is too large + } else if (w > opt->w<<2 || r >= PATCH_MAX_R_BW*2) return 0; // more permissive if overlapping on both ref and query +- ++ + // global alignment + w += a->w + b->w; + w = w < opt->w<<2? w : opt->w<<2; + + if (bwa_verbose >= 4) + fprintf(stderr, "* test potential hit merge with global alignment; w=%d\n", w); +- ++ + bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, + bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, + &score, 0, 0); +- ++ + q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * + (b->score + a->score) + .499); // predicted score from query +- ++ + r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * + (b->score + a->score) + .499); // predicted score from ref +- ++ + if (bwa_verbose >= 4) + fprintf(stderr, "* score=%d;(%d,%d)\n", score, q_s, r_s); +- ++ + if ((double)score / (q_s > r_s? q_s : r_s) < PATCH_MIN_SC_RATIO) return 0; + *_w = w; + return score; +@@ -296,14 +296,14 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, + int m, i, j; + if (n <= 1) return n; + ks_introsort(mem_ars2, n, a); // sort by the END position, not START! +- ++ + for (i = 0; i < n; ++i) a[i].n_comp = 1; + for (i = 1; i < n; ++i) + { + mem_alnreg_t *p = &a[i]; + if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) + continue; // then no need to go into the loop below +- ++ + for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) { + mem_alnreg_t *q = &a[j]; + int64_t or_, oq, mr, mq; +@@ -366,10 +366,10 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && + p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) + return 1; // contained seed; do nothing +- ++ + if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && + p->rbeg >= l_pac) return 0; // don't chain if on different strand +- ++ + x = p->qbeg - last->qbeg; // always non-negtive + y = p->rbeg - last->rbeg; + if (y >= 0 && x - y <= opt->w && y - x <= opt->w && +@@ -378,7 +378,7 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, + if (c->n == c->m) + { + mem_seed_t *auxSeedBuf = NULL; +- int pm = c->m; ++ int pm = c->m; + c->m <<= 1; + if (pm == SEEDS_PER_CHAIN) { // re-new memory + if ((auxSeedBuf = (mem_seed_t *) calloc(c->m, sizeof(mem_seed_t))) == NULL) { fprintf(stderr, "ERROR: out of memory auxSeedBuf\n"); exit(1); } +@@ -390,7 +390,7 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, + if ((auxSeedBuf = (mem_seed_t *) realloc(c->seeds, c->m * sizeof(mem_seed_t))) == NULL) { fprintf(stderr, "ERROR: out of memory auxSeedBuf\n"); exit(1); } + c->seeds = auxSeedBuf; + } +- memset((char*) (c->seeds + c->n), 0, (c->m - c->n) * sizeof(mem_seed_t)); ++ memset((char*) (c->seeds + c->n), 0, (c->m - c->n) * sizeof(mem_seed_t)); + } + c->seeds[c->n++] = *p; + return 1; +@@ -474,7 +474,7 @@ void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint + { + //double min_l = opt->min_chain_weight? + // MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); +- ++ + int i, j, k;// min_HSP_score = (int)(opt->a * min_l + .499); + //if (min_l > MEM_SEEDSW_COEF * l_query) return; // don't run the following for short reads + +@@ -488,7 +488,7 @@ void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint + MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); + int min_HSP_score = (int)(opt->a * min_l + .499); + if (min_l > MEM_SEEDSW_COEF * l_query) continue; +- ++ + for (j = k = 0; j < c->n; ++j) + { + mem_seed_t *s = &c->seeds[j]; +@@ -601,7 +601,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn_, mem_chain_t *a_, int tid) + + for (; i < n_chn; ++i) + if (a[i].kept < 3) a[i].kept = 0; +- ++ + for (i = k = 0; i < n_chn; ++i) // free discarded chains + { + mem_chain_t *c = &a[i]; +@@ -631,39 +631,23 @@ SMEM *mem_collect_smem(FMI_search *fmi, const mem_opt_t *opt, + int16_t *query_pos_ar, + uint8_t *enc_qdb, + int32_t *rid, +- mem_cache *mmc, +- int64_t &tot_smem, +- int tid) ++ int64_t &tot_smem) + { + int64_t pos = 0; + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + int64_t num_smem1 = 0, num_smem2 = 0, num_smem3 = 0; + int max_readlength = -1; +- ++ + int32_t *query_cum_len_ar = (int32_t *)_mm_malloc(nseq * sizeof(int32_t), 64); +- +- // New addition: checks +- // int64_t tot_seq_len = 0; +- // for (int l=0; l mmc->wsize_mem_s[tid]) { +- // fprintf(stderr, "[%0.4d] REA Re-allocating enc_qdb only\n", tid); +- // mmc->wsize_mem_s[tid] = tot_seq_len + 1024; +- // mmc->enc_qdb[tid] = (uint8_t *) realloc(mmc->enc_qdb[tid], +- // mmc->wsize_mem_s[tid] * sizeof(uint8_t)); +- // enc_qdb = mmc->enc_qdb[tid]; +- // } +- +- int offset = 0, max_read_len = 0; ++ ++ int offset = 0; + for (int l=0; lgetSMEMsAllPosOneThread(enc_qdb, min_intv_ar, rid, nseq, nseq, + seq_, query_cum_len_ar, max_readlength, opt->min_seed_len, + matchArray, &num_smem1); +- // New addition: checks +- // if (num_smem1 > mmc->wsize_mem_r[tid]) { +- // fprintf(stderr, "[%0.4d] REA Re-allocating min_intv, query_pos, & rid\n", tid); +- // mmc->wsize_mem_r[tid] = num_smem1 + 1024; +- // mmc->min_intv_ar[tid] = (int32_t *) realloc(mmc->min_intv_ar[tid], +- // mmc->wsize_mem_r[tid] * sizeof(int32_t)); +- // mmc->query_pos_ar[tid] = (int16_t *) realloc(mmc->query_pos_ar[tid], +- // mmc->wsize_mem_r[tid] * sizeof(int16_t)); +- // mmc->rid[tid] = (int32_t *) realloc(mmc->rid[tid], +- // mmc->wsize_mem_r[tid] * sizeof(int32_t)); +- // } +- +- // fprintf(stderr, "num_smem1: %d\n", num_smem1); +- int64_t mem_lim = 0; ++ ++ + for (int64_t i=0; im, end = p->n +1; + if (end - start < split_len || p->s > opt->split_width) + continue; +- ++ + int len = seq_[p->rid].l_seq; + + rid[pos] = p->rid; +- ++ + query_pos_ar[pos] = (end + start)>>1; + + assert(query_pos_ar[pos] < len); + + min_intv_ar[pos] = p->s + 1; + pos ++; +- // mem_lim += seq_[p->rid].l_seq; +- mem_lim += end - start; + } + +- #if 1 +- // fprintf(stderr, "num_smem2: %d, lim: %d\n", num_smem2, mem_lim + num_smem1 + offset); +- if (mmc->wsize_mem[tid] < mem_lim + num_smem1) { +- fprintf(stderr, "[%0.4d] REA Re-allocating SMEM data structures after num_smem1\n", tid); +- int64_t tmp = mmc->wsize_mem[tid]; +- mmc->wsize_mem[tid] = mem_lim + num_smem1; +- +- SMEM* ptr1 = mmc->matchArray[tid]; +- // ptr2 = w.mmc.min_intv_ar; +- // ptr3 = +- fprintf(stderr, "Realloc size from: %d to :%d\n", tmp, mmc->wsize_mem[tid]); +- mmc->matchArray[tid] = (SMEM *) _mm_malloc(mmc->wsize_mem[tid] * sizeof(SMEM), 64); +- assert(mmc->matchArray[tid] != NULL); +- matchArray = mmc->matchArray[tid]; +- // w.mmc.min_intv_ar[l] = (int32_t *) malloc(w.mmc.wsize_mem[l] * sizeof(int32_t)); +- // w.mmc.query_pos_ar[tid] = (int16_t *) malloc(w.mmc.wsize_mem[l] * sizeof(int16_t)); +- // w.mmc.enc_qdb[l] = (uint8_t *) malloc(w.mmc.wsize_mem[l] * sizeof(uint8_t)); +- // w.mmc.rid[l] = (int32_t *) malloc(w.mmc.wsize_mem[l] * sizeof(int32_t)); +- //w.mmc.lim[l] = (int32_t *) _mm_malloc((BATCH_SIZE + 32) * sizeof(int32_t), 64); +- for (int i=0; imatchArray[tid][i] = ptr1[i]; +- } +- _mm_free(ptr1); +- } +- #endif +- + fmi->getSMEMsOnePosOneThread(enc_qdb, + query_pos_ar, + min_intv_ar, +@@ -751,37 +695,19 @@ SMEM *mem_collect_smem(FMI_search *fmi, const mem_opt_t *opt, + opt->min_seed_len, + matchArray + num_smem1, + &num_smem2); +- // assert(mmc->wsize_mem[tid] > (num_smem1 + num_smem2)); +- // fprintwsize_mem_rf(stderr, "num_smem2: %d\n", num_smem2); ++ + if (opt->max_mem_intv > 0) + { +- #if 1 +- if (mmc->wsize_mem[tid] < num_smem2 + offset*1.0/(opt->min_seed_len + 1) + num_smem1) { +- int64_t tmp = mmc->wsize_mem[tid]; +- mmc->wsize_mem[tid] = num_smem2 + offset*1.0/(opt->min_seed_len + 1) + num_smem1; +- SMEM* ptr1 = mmc->matchArray[tid]; +- fprintf(stderr, "[%0.4d] REA Re-allocating SMEM data structures after num_smem2\n", tid); +- fprintf(stderr, "Realloc2 size from: %d to :%d\n", tmp, mmc->wsize_mem[tid]); +- mmc->matchArray[tid] = (SMEM *) _mm_malloc(mmc->wsize_mem[tid] * sizeof(SMEM), 64); +- assert(mmc->matchArray[tid] != NULL); +- matchArray = mmc->matchArray[tid]; +- for (int i=0; imatchArray[tid][i] = ptr1[i]; +- } +- _mm_free(ptr1); +- } +- #endif + for (int l=0; lmax_mem_intv; + + num_smem3 = fmi->bwtSeedStrategyAllPosOneThread(enc_qdb, min_intv_ar, +- nseq, seq_, query_cum_len_ar, ++ nseq, seq_, query_cum_len_ar, + opt->min_seed_len + 1, +- matchArray + num_smem1 + num_smem2); ++ matchArray + num_smem1 + num_smem2); + } + tot_smem = num_smem1 + num_smem2 + num_smem3; +- // assert(mmc->wsize_mem[tid] > (tot_smem)); +- // fprintf(stderr, "num_smems: %d %d %d, %d\n", num_smem1, num_smem2, num_smem3, tot_smem); ++ + fmi->sortSMEMs(matchArray, &tot_smem, nseq, seq_[0].l_seq, 1); // seq_[0].l_seq - only used for blocking when using nthreads + + pos = 0; +@@ -792,7 +718,7 @@ SMEM *mem_collect_smem(FMI_search *fmi, const mem_opt_t *opt, + pos++; + } while (pos < tot_smem - 1 && matchArray[pos].rid == matchArray[pos + 1].rid); + int64_t n = pos + 1 - smem_ptr; +- ++ + if (n > 0) + ks_introsort(mem_intv1, n, &matchArray[smem_ptr]); + smem_ptr = pos + 1; +@@ -818,19 +744,19 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + int64_t i, pos = 0; + int64_t smem_ptr = 0; + int64_t l_pac = bns->l_pac; +- ++ + int num[nseq]; + memset(num, 0, nseq*sizeof(int)); + int smem_buf_size = 6000; + int64_t *sa_coord = (int64_t *) _mm_malloc(sizeof(int64_t) * opt->max_occ * smem_buf_size, 64); + int64_t seedBufCount = 0; +- ++ + for (int l=0; lmin_seed_len) return chain; // if the query is shorter than the seed length, no match +- ++ + uint64_t tim = __rdtsc(); + for (int l=0; lmem.n; ++i) // compute frac_rep +@@ -877,7 +803,7 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + pos - smem_ptr + 1, opt->max_occ, tid, id); // sa compressed prefetch + tprof[MEM_SA][tid] += __rdtsc() - tim; + #endif +- ++ + for (i = smem_ptr; i <= pos; i++) + { + SMEM *p = &matchArray[i]; +@@ -892,8 +818,8 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + fmi->get_sa_entries(p, sa_coord, &cnt, 1, opt->max_occ); + tprof[MEM_SA][tid] += __rdtsc() - tim; + #endif +- +- cnt = 0; ++ ++ cnt = 0; + for (k = count = 0; k < p->s && count < opt->max_occ; k += step, ++count) + { + mem_chain_t tmp, *lower, *upper; +@@ -905,18 +831,18 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + #else + s.rbeg = tmp.pos = sa_coord[cnt++]; + #endif +- ++ + s.qbeg = p->m; + s.score= s.len = slen; +- if (s.rbeg < 0 || s.len < 0) ++ if (s.rbeg < 0 || s.len < 0) + fprintf(stderr, "rbeg: %ld, slen: %d, cnt: %d, n: %d, m: %d, num_smem: %ld\n", + s.rbeg, s.len, cnt-1, p->n, p->m, num_smem); +- ++ + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + // bridging multiple reference sequences or the + // forward-reverse boundary; TODO: split the seed; + // don't discard it!!! +- if (rid < 0) continue; ++ if (rid < 0) continue; + if (kb_size(tree)) + { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain +@@ -952,10 +878,10 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + } + } // seeds + +- smem_ptr = pos + 1; ++ smem_ptr = pos + 1; + size = kb_size(tree); + // tprof[PE21][0] += kb_size(tree) * sizeof(mem_chain_t); +- ++ + kv_resize(mem_chain_t, *chain, size); + + #define traverse_func(p_) (chain->a[chain->n++] = *(p_)) +@@ -965,8 +891,8 @@ void mem_chain_seeds(FMI_search *fmi, const mem_opt_t *opt, + for (i = 0; i < chain->n; ++i) + chain->a[i].frac_rep = (float)l_rep / seq_[l].l_seq; + +- kb_destroy(chn, tree); +- ++ kb_destroy(chn, tree); ++ + } // iterations over input reads + tprof[MEM_SA_BLOCK][tid] += __rdtsc() - tim; + +@@ -982,11 +908,11 @@ int mem_kernel1_core(FMI_search *fmi, + int64_t seedBufSize, + mem_cache *mmc, + int tid) +-{ ++{ + int i; + int64_t num_smem = 0, tot_len = 0; + mem_chain_v *chn; +- ++ + uint64_t tim; + /* convert to 2-bit encoding if we have not done so */ + for (int l=0; lwsize_mem[tid], tot_len); ++ tot_len *= N_SMEM_KERNEL; + // This covers enc_qdb/SMEM reallocs + if (tot_len >= mmc->wsize_mem[tid]) + { + fprintf(stderr, "[%0.4d] Re-allocating SMEM data structures due to enc_qdb\n", tid); + int64_t tmp = mmc->wsize_mem[tid]; + mmc->wsize_mem[tid] = tot_len; +- mmc->wsize_mem_s[tid] = tot_len; +- mmc->wsize_mem_r[tid] = tot_len; + mmc->matchArray[tid] = (SMEM *) _mm_realloc(mmc->matchArray[tid], + tmp, mmc->wsize_mem[tid], sizeof(SMEM)); + //realloc(mmc->matchArray[tid], mmc->wsize_mem[tid] * sizeof(SMEM)); +@@ -1028,29 +951,26 @@ int mem_kernel1_core(FMI_search *fmi, + uint8_t *enc_qdb = mmc->enc_qdb[tid]; + int32_t *rid = mmc->rid[tid]; + int64_t *wsize_mem = &mmc->wsize_mem[tid]; +- +- tim = __rdtsc(); ++ ++ tim = __rdtsc(); + /********************** Kernel 1: FM+SMEMs *************************/ + printf_(VER, "6. Calling mem_collect_smem.., tid: %d\n", tid); +- matchArray = mem_collect_smem(fmi, opt, +- seq_, +- nseq, +- matchArray, +- min_intv_ar, +- query_pos_ar, +- enc_qdb, +- rid, +- mmc, +- num_smem, +- tid); ++ mem_collect_smem(fmi, opt, ++ seq_, ++ nseq, ++ matchArray, ++ min_intv_ar, ++ query_pos_ar, ++ enc_qdb, ++ rid, ++ num_smem); + + if (num_smem >= *wsize_mem){ +- fprintf(stderr, "Error [bug]: num_smem: %ld are more than allocated space %ld.\n", +- num_smem, *wsize_mem); ++ fprintf(stderr, "Error [bug]: num_smem: %ld are more than allocated space.\n", num_smem); + exit(EXIT_FAILURE); + } + printf_(VER, "6. Done! mem_collect_smem, num_smem: %ld\n", num_smem); +- tprof[MEM_COLLECT][tid] += __rdtsc() - tim; ++ tprof[MEM_COLLECT][tid] += __rdtsc() - tim; + + + /********************* Kernel 1.1: SA2REF **********************/ +@@ -1062,7 +982,7 @@ int mem_kernel1_core(FMI_search *fmi, + seedBufSize, + matchArray, + num_smem); +- ++ + printf_(VER, "5. Done mem_chain..\n"); + // tprof[MEM_CHAIN][tid] += __rdtsc() - tim; + +@@ -1077,7 +997,7 @@ int mem_kernel1_core(FMI_search *fmi, + printf_(VER, "7. Done mem_chain_flt..\n"); + // tprof[MEM_ALN_M1][tid] += __rdtsc() - tim; + +- ++ + printf_(VER, "8. Calling mem_flt_chained_seeds..\n"); + for (int l=0; lidx->bns, + fmi->idx->pac, + (uint8_t*) seq_[l].seq, +@@ -1168,16 +1088,16 @@ int mem_kernel2_core(FMI_search *fmi, + } + } + // tprof[POST_SWA][tid] += __rdtsc() - tim; +- ++ + return 1; + } + + static void worker_aln(void *data, int seq_id, int batch_size, int tid) + { + worker_t *w = (worker_t*) data; +- +- printf_(VER, "11. Calling mem_kernel2_core..\n"); +- mem_kernel2_core(w->fmi, w->opt, ++ ++ printf_(VER, "11. Calling mem_kernel2_core..\n"); ++ mem_kernel2_core(w->fmi, w->opt, + w->seqs + seq_id, + w->regs + seq_id, + batch_size, +@@ -1196,7 +1116,7 @@ static void worker_bwt(void *data, int seq_id, int batch_size, int tid) + printf_(VER, "4. Calling mem_kernel1_core..%d %d\n", seq_id, tid); + int seedBufSz = w->seedBufSize; + +- int memSize = w->nreads; ++ int memSize = w->nreads; + if (batch_size < BATCH_SIZE) { + seedBufSz = (memSize - seq_id) * AVG_SEEDS_PER_READ; + // fprintf(stderr, "[%0.4d] Info: adjusted seedBufSz %d\n", tid, seedBufSz); +@@ -1234,7 +1154,7 @@ int64_t sort_classify(mem_cache *mmc, int64_t pcnt, int tid) + } + } + assert(pos8 + pos16 == pcnt); +- ++ + for (int i=pos8; iopt->flag & MEM_F_PE) + { + int64_t pcnt = 0; + int start = seqid; + int end = seqid + batch_size; + int pos = start >> 1; +- +-#if (((!__AVX512BW__) && (!__AVX2__)) || ((!__AVX512BW__) && (__AVX2__))) ++ ++#if (((!__AVX512BW__) && (!__AVX2__)) || ((!__AVX512BW__) && (__AVX2__))) + for (int i=start; i< end; i+=2) + { + // orig mem_sam_pe() function +@@ -1262,7 +1182,7 @@ static void worker_sam(void *data, int seqid, int batch_size, int tid) + (w->n_processed >> 1) + pos++, // check! + &w->seqs[i], + &w->regs[i]); +- ++ + free(w->regs[i].a); + free(w->regs[i+1].a); + } +@@ -1278,13 +1198,13 @@ static void worker_sam(void *data, int seqid, int batch_size, int tid) + (w->n_processed >> 1) + pos++, // check! + &w->seqs[i], + &w->regs[i], +- &w->mmc, ++ &w->mmc, + pcnt, gcnt, + maxRefLen, + maxQerLen, + tid); + } +- ++ + // tprof[SAM1][tid] += __rdtsc() - tim; + int64_t pcnt8 = sort_classify(&w->mmc, pcnt, tid); + +@@ -1292,7 +1212,7 @@ static void worker_sam(void *data, int seqid, int batch_size, int tid) + assert(aln != NULL); + + // processing +- mem_sam_pe_batch(w->opt, &w->mmc, pcnt, pcnt8, aln, maxRefLen, maxQerLen, tid); ++ mem_sam_pe_batch(w->opt, &w->mmc, pcnt, pcnt8, aln, maxRefLen, maxQerLen, tid); + + // post-processing + // tim = __rdtsc(); +@@ -1314,7 +1234,7 @@ static void worker_sam(void *data, int seqid, int batch_size, int tid) + free(w->regs[i].a); + free(w->regs[i+1].a); + } +- //tprof[SAM3][tid] += __rdtsc() - tim; ++ //tprof[SAM3][tid] += __rdtsc() - tim; + _mm_free(aln); // kswr_t + #endif + } +@@ -1326,7 +1246,7 @@ static void worker_sam(void *data, int seqid, int batch_size, int tid) + w->regs[i].a, + w->n_processed + i); + #if V17 // Feature from v0.7.17 of orig. bwa-mem +- if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); ++ if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); + #endif + mem_reg2sam(w->opt, w->fmi->idx->bns, w->fmi->idx->pac, &w->seqs[i], + &w->regs[i], 0, 0); +@@ -1344,7 +1264,7 @@ void mem_process_seqs(mem_opt_t *opt, + { + mem_pestat_t pes[4]; + double ctime, rtime; +- ++ + ctime = cputime(); rtime = realtime(); + w.opt = opt; + w.seqs = seqs; w.n_processed = n_processed; +@@ -1352,16 +1272,16 @@ void mem_process_seqs(mem_opt_t *opt, + + //int n_ = (opt->flag & MEM_F_PE) ? n : n; // this requires n%2==0 + int n_ = n; +- +- uint64_t tim = __rdtsc(); ++ ++ uint64_t tim = __rdtsc(); + fprintf(stderr, "[0000] 1. Calling kt_for - worker_bwt\n"); +- ++ + kt_for(worker_bwt, &w, n_); // SMEMs (+SAL) + + fprintf(stderr, "[0000] 2. Calling kt_for - worker_aln\n"); +- ++ + kt_for(worker_aln, &w, n_); // BSW +- tprof[WORKER10][0] += __rdtsc() - tim; ++ tprof[WORKER10][0] += __rdtsc() - tim; + + + // PAIRED_END +@@ -1376,11 +1296,11 @@ void mem_process_seqs(mem_opt_t *opt, + // distribution from data + } + } +- ++ + tim = __rdtsc(); + fprintf(stderr, "[0000] 3. Calling kt_for - worker_sam\n"); +- +- kt_for(worker_sam, &w, n_); // SAM ++ ++ kt_for(worker_sam, &w, n_); // SAM + tprof[WORKER20][0] += __rdtsc() - tim; + + fprintf(stderr, "\t[0000][ M::%s] Processed %d reads in %.3f " +@@ -1422,7 +1342,7 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id + int i, n_pri; + int_v z = {0,0,0}; + if (n == 0) return 0; +- ++ + for (i = n_pri = 0; i < n; ++i) + { + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); +@@ -1528,7 +1448,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, + + if (!(opt->flag & MEM_F_ALL)) + XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); +- ++ + kv_init(aa); + str.l = str.m = 0; str.s = 0; + for (k = l = 0; k < a->n; ++k) +@@ -1545,7 +1465,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, + + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + assert(q->rid >= 0); // this should not happen with the new code +- q->XA = XA? XA[k] : 0; ++ q->XA = XA? XA[k] : 0; + q->flag |= extra_flag; // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score + if (l && p->secondary < 0) // if supplementary +@@ -1562,7 +1482,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, + mem_aln_t t; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + t.flag |= extra_flag; +- mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); ++ mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); + } else { + for (k = 0; k < aa.n; ++k) + mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); +@@ -1591,7 +1511,7 @@ static inline void add_cigar(const mem_opt_t *opt, mem_aln_t *p, kstring_t *str, + + void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, + bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) +-{ ++{ + int i, l_name; + mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert + +@@ -1679,7 +1599,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, + str->s[str->l] = 0; + } else kputc('*', str); + } +- ++ + // print optional tags + if (p->n_cigar) { + kputsn("\tNM:i:", 6, str); kputw(p->NM, str); +@@ -1687,7 +1607,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, + } + #if V17 + if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); } +-#endif ++#endif + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } +@@ -1716,7 +1636,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, + } + + if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } +- ++ + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) { + int tmp; +@@ -1841,7 +1761,7 @@ void* _mm_realloc(void *ptr, int64_t csize, int64_t nsize, int16_t dsize) { + assert(nptr != NULL); + memcpy_bwamem(nptr, nsize * dsize, ptr, csize, __FILE__, __LINE__); + _mm_free(ptr); +- ++ + return nptr; + } + +@@ -1858,11 +1778,11 @@ uint8_t *bns_get_seq_v2(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t + if (beg >= l_pac || end <= l_pac) { + *len = end - beg; + assert(end-beg < BATCH_SIZE * SEEDS_PER_READ * sizeof(SeqPair)); +- ++ + //seq = (uint8_t*) malloc(end - beg); + // seq = seqb; + if (beg >= l_pac) { // reverse strand +-#if 0 // orig ++#if 0 // orig + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) { +@@ -1880,7 +1800,7 @@ uint8_t *bns_get_seq_v2(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t + } + #else + seq = ref_string + beg; +-#endif ++#endif + } + + } else *len = 0; // if bridging the forward-reverse boundary, return nothing +@@ -1899,7 +1819,7 @@ uint8_t *bns_fetch_seq_v2(const bntseq_t *bns, const uint8_t *pac, + // if (*beg > mid || mid >= *end) + // fprintf(Error: stderr, "%ld %ld %ld\n", *beg, mid, *end); + assert(*beg <= mid && mid < *end); +- ++ + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; +@@ -1912,7 +1832,7 @@ uint8_t *bns_fetch_seq_v2(const bntseq_t *bns, const uint8_t *pac, + *end = *end < far_end? *end : far_end; + + seq = bns_get_seq_v2(bns->l_pac, pac, *beg, *end, &len, ref_string, seqb); +- ++ + if (seq == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); +@@ -1932,29 +1852,29 @@ inline void sortPairsLenExt(SeqPair *pairArray, int32_t count, SeqPair *tempArra + + int32_t *hist2 = hist + MAX_SEQ_LEN8; + int32_t *hist3 = hist + MAX_SEQ_LEN8 + MAX_SEQ_LEN16; +- ++ + for(i = 0; i <= MAX_SEQ_LEN8 + MAX_SEQ_LEN16; i+=1) + //_mm256_store_si256((__m256i *)(hist + i), zero256); + hist[i] = 0; +- ++ + int *arr = (int*) calloc (count, sizeof(int)); + assert(arr != NULL); +- ++ + for(i = 0; i < count; i++) + { + SeqPair sp = pairArray[i]; + // int minval = sp.h0 + max_(sp.len1, sp.len2); + int minval = sp.h0 + min_(sp.len1, sp.len2) * score_a; +- if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) ++ if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) + hist[minval]++; + else if(sp.len1 < MAX_SEQ_LEN16 && sp.len2 < MAX_SEQ_LEN16 && minval < MAX_SEQ_LEN16) + hist2[minval] ++; + else + hist3[0] ++; +- ++ + arr[i] = 0; + } +- ++ + int32_t cumulSum = 0; + for(i = 0; i < MAX_SEQ_LEN8; i++) + { +@@ -1975,8 +1895,8 @@ inline void sortPairsLenExt(SeqPair *pairArray, int32_t count, SeqPair *tempArra + SeqPair sp = pairArray[i]; + // int minval = sp.h0 + max_(sp.len1, sp.len2); + int minval = sp.h0 + min_(sp.len1, sp.len2) * score_a; +- +- if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) ++ ++ if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) + { + int32_t pos = hist[minval]; + tempArray[pos] = sp; +@@ -1986,7 +1906,7 @@ inline void sortPairsLenExt(SeqPair *pairArray, int32_t count, SeqPair *tempArra + { + fprintf(stderr, "[%s] Error log: repeat, pos: %d, arr: %d, minval: %d, (%d %d)\n", + __func__, pos, arr[pos], minval, sp.len1, sp.len2); +- exit(EXIT_FAILURE); ++ exit(EXIT_FAILURE); + } + arr[pos] = 1; + } +@@ -2002,8 +1922,8 @@ inline void sortPairsLenExt(SeqPair *pairArray, int32_t count, SeqPair *tempArra + "i: %d, pos: %d, arr: %d, hist2: %d, minval: %d, (%d %d %d) (%d %d %d)\n", + __func__, i, pos, arr[pos], hist2[minval], minval, sp.h0, sp.len1, sp.len2, + spt.h0, spt.len1, spt.len2); +- exit(EXIT_FAILURE); +- } ++ exit(EXIT_FAILURE); ++ } + arr[pos] = i + 1; + } + else { +@@ -2014,7 +1934,7 @@ inline void sortPairsLenExt(SeqPair *pairArray, int32_t count, SeqPair *tempArra + numPairs1 ++; + } + } +- ++ + for(i = 0; i < count; i++) { + pairArray[i] = tempArray[i]; + } +@@ -2026,16 +1946,16 @@ inline void sortPairsLen(SeqPair *pairArray, int32_t count, SeqPair *tempArray, + { + + int32_t i; +-#if ((!__AVX512BW__) & (__AVX2__ | __SSE2__)) ++#if (!__AVX512BW__) + for(i = 0; i <= MAX_SEQ_LEN16; i++) hist[i] = 0; +-#else ++#else + __m512i zero512 = _mm512_setzero_si512(); + for(i = 0; i <= MAX_SEQ_LEN16; i+=16) + { + _mm512_store_si512((__m512i *)(hist + i), zero512); + } + #endif +- ++ + for(i = 0; i < count; i++) + { + SeqPair sp = pairArray[i]; +@@ -2076,20 +1996,20 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + SeqPair *seqPairArrayRight128 = mmc->seqPairArrayRight128[tid]; + int64_t *wsize_pair = &(mmc->wsize[tid]); + +- uint8_t *seqBufLeftRef = mmc->seqBufLeftRef[tid*CACHE_LINE]; ++ uint8_t *seqBufLeftRef = mmc->seqBufLeftRef[tid*CACHE_LINE]; + uint8_t *seqBufRightRef = mmc->seqBufRightRef[tid*CACHE_LINE]; +- uint8_t *seqBufLeftQer = mmc->seqBufLeftQer[tid*CACHE_LINE]; ++ uint8_t *seqBufLeftQer = mmc->seqBufLeftQer[tid*CACHE_LINE]; + uint8_t *seqBufRightQer = mmc->seqBufRightQer[tid*CACHE_LINE]; + int64_t *wsize_buf_ref = &(mmc->wsize_buf_ref[tid*CACHE_LINE]); + int64_t *wsize_buf_qer = &(mmc->wsize_buf_qer[tid*CACHE_LINE]); +- ++ + // int32_t *lim_g = mmc->lim + (BATCH_SIZE + 32) * tid; + int32_t *lim_g = mmc->lim[tid]; +- ++ + mem_seed_t *s; + int64_t l_pac = bns->l_pac, rmax[8] __attribute__((aligned(64))); + // std::vector nexitv(nseq, 0); +- ++ + int numPairsLeft = 0, numPairsRight = 0; + int numPairsLeft1 = 0, numPairsRight1 = 0; + int numPairsLeft128 = 0, numPairsRight128 = 0; +@@ -2103,25 +2023,25 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + uint32_t *srtgg = (uint32_t*) malloc(nseq * SEEDS_PER_READ * fac * sizeof(uint32_t)); + + int spos = 0; +- ++ + // uint64_t timUP = __rdtsc(); + for (int l=0; lm = 0; + for (int j=0; jn; j++) { +@@ -2134,13 +2054,13 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + { + c = &chn->a[j]; + assert(c->seqid == l); +- ++ + int64_t tmp = 0; + if (c->n == 0) continue; +- ++ + _mm_prefetch((const char*) (srtg + spos + 64), _MM_HINT_NTA); + _mm_prefetch((const char*) (lim_g), _MM_HINT_NTA); +- ++ + // get the max possible span + rmax[0] = l_pac<<1; rmax[1] = 0; + +@@ -2156,7 +2076,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + rmax[1] = (rmax[1] > e)? rmax[1] : e; + if (t->len > max) max = t->len; + } +- ++ + rmax[0] = rmax[0] > 0? rmax[0] : 0; + rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1; + if (rmax[0] < l_pac && l_pac < rmax[1]) +@@ -2178,30 +2098,30 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + _mm_prefetch((const char*) rseq, _MM_HINT_NTA); + // _mm_prefetch((const char*) rseq + 64, _MM_HINT_NTA); +- ++ + // assert(c->n < MAX_SEEDS_PER_READ); // temp + if (c->n > srt_size) { + srt_size = c->n + 10; + srt = (uint64_t *) realloc(srt, srt_size * 8); + } +- +- for (int i = 0; i < c->n; ++i) ++ ++ for (int i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].score<<32 | i; + +- if (c->n > 1) ++ if (c->n > 1) + ks_introsort_64(c->n, srt); +- ++ + // assert((spos + c->n) < SEEDS_PER_READ * FAC * nseq); + if ((spos + c->n) > SEEDS_PER_READ * fac * nseq) { + fac <<= 1; + srtgg = (uint32_t *) realloc(srtgg, nseq * SEEDS_PER_READ * fac * sizeof(uint32_t)); + } +- ++ + for (int i = 0; i < c->n; ++i) + srtg[spos++] = srt[i]; + + lim_g[l+1] += c->n; +- ++ + // uint64_t tim = __rdtsc(); + for (int k=c->n-1; k >= 0; k--) + { +@@ -2211,9 +2131,9 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + // a = kv_pushp(mem_alnreg_t, *av); + a = &av->a[av->n++]; + memset(a, 0, sizeof(mem_alnreg_t)); +- ++ + s->aln = av->n-1; +- ++ + a->w = opt->w; + a->score = a->truesc = -1; + a->rid = c->rid; +@@ -2221,9 +2141,9 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + a->seedlen0 = s->len; + a->c = c; //ptr + a->rb = a->qb = a->re = a->qe = H0_; +- ++ + tprof[PE19][tid] ++; +- ++ + int flag = 0; + std::pair pr; + if (s->qbeg) // left extension +@@ -2232,12 +2152,10 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + sp.h0 = s->len * opt->a; + sp.seqid = c->seqid; + sp.regid = av->n - 1; +- ++ + if (numPairsLeft >= *wsize_pair) { + fprintf(stderr, "[0000][%0.4d] Re-allocating seqPairArrays, in Left\n", tid); +- *wsize_pair += 1024; +- // assert(*wsize_pair > numPairsLeft); +- *wsize_pair += numPairsLeft + 1024; ++ *wsize_pair += 1024; + seqPairArrayAux = (SeqPair *) realloc(seqPairArrayAux, + (*wsize_pair + MAX_LINE_LEN) + * sizeof(SeqPair)); +@@ -2252,10 +2170,10 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + mmc->seqPairArrayRight128[tid] = seqPairArrayRight128; + } + +- ++ + sp.idq = leftQerOffset; + sp.idr = leftRefOffset; +- ++ + leftQerOffset += s->qbeg; + if (leftQerOffset >= *wsize_buf_qer) + { +@@ -2263,20 +2181,18 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + tid, __func__); + int64_t tmp = *wsize_buf_qer; + *wsize_buf_qer *= 2; +- assert(*wsize_buf_qer > leftQerOffset); +- + uint8_t *seqBufQer_ = (uint8_t*) +- _mm_realloc(seqBufLeftQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); ++ _mm_realloc(seqBufLeftQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); + mmc->seqBufLeftQer[tid*CACHE_LINE] = seqBufLeftQer = seqBufQer_; +- ++ + seqBufQer_ = (uint8_t*) +- _mm_realloc(seqBufRightQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); +- mmc->seqBufRightQer[tid*CACHE_LINE] = seqBufRightQer = seqBufQer_; ++ _mm_realloc(seqBufRightQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); ++ mmc->seqBufRightQer[tid*CACHE_LINE] = seqBufRightQer = seqBufQer_; + } +- ++ + uint8_t *qs = seqBufLeftQer + sp.idq; + for (int i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; +- ++ + tmp = s->rbeg - rmax[0]; + leftRefOffset += tmp; + if (leftRefOffset >= *wsize_buf_ref) +@@ -2286,31 +2202,31 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + int64_t tmp = *wsize_buf_ref; + *wsize_buf_ref *= 2; + uint8_t *seqBufRef_ = (uint8_t*) +- _mm_realloc(seqBufLeftRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); ++ _mm_realloc(seqBufLeftRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); + mmc->seqBufLeftRef[tid*CACHE_LINE] = seqBufLeftRef = seqBufRef_; +- ++ + seqBufRef_ = (uint8_t*) +- _mm_realloc(seqBufRightRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); +- mmc->seqBufRightRef[tid*CACHE_LINE] = seqBufRightRef = seqBufRef_; ++ _mm_realloc(seqBufRightRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); ++ mmc->seqBufRightRef[tid*CACHE_LINE] = seqBufRightRef = seqBufRef_; + } +- +- uint8_t *rs = seqBufLeftRef + sp.idr; ++ ++ uint8_t *rs = seqBufLeftRef + sp.idr; + for (int64_t i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; //seq1 +- ++ + sp.len2 = s->qbeg; + sp.len1 = tmp; + int minval = sp.h0 + min_(sp.len1, sp.len2) * opt->a; +- ++ + if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) { + numPairsLeft128++; + } + else if (sp.len1 < MAX_SEQ_LEN16 && sp.len2 < MAX_SEQ_LEN16 && minval < MAX_SEQ_LEN16){ + numPairsLeft16++; + } +- else { ++ else { + numPairsLeft1++; + } +- ++ + seqPairArrayLeft128[numPairsLeft] = sp; + numPairsLeft ++; + a->qb = s->qbeg; a->rb = s->rbeg; +@@ -2325,7 +2241,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + { + int64_t qe = s->qbeg + s->len; + int64_t re = s->rbeg + s->len - rmax[0]; +- assert(re >= 0); ++ assert(re >= 0); + SeqPair sp; + + sp.h0 = H0_; //random number +@@ -2336,8 +2252,6 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + { + fprintf(stderr, "[0000] [%0.4d] Re-allocating seqPairArrays Right\n", tid); + *wsize_pair += 1024; +- // assert(*wsize_pair > numPairsRight); +- *wsize_pair += numPairsLeft + 1024; + seqPairArrayAux = (SeqPair *) realloc(seqPairArrayAux, + (*wsize_pair + MAX_LINE_LEN) + * sizeof(SeqPair)); +@@ -2351,13 +2265,13 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + * sizeof(SeqPair)); + mmc->seqPairArrayRight128[tid] = seqPairArrayRight128; + } +- ++ + sp.len2 = l_query - qe; + sp.len1 = rmax[1] - rmax[0] - re; + + sp.idq = rightQerOffset; + sp.idr = rightRefOffset; +- ++ + rightQerOffset += sp.len2; + if (rightQerOffset >= *wsize_buf_qer) + { +@@ -2365,15 +2279,13 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + tid, __func__); + int64_t tmp = *wsize_buf_qer; + *wsize_buf_qer *= 2; +- assert(*wsize_buf_qer > rightQerOffset); +- + uint8_t *seqBufQer_ = (uint8_t*) +- _mm_realloc(seqBufLeftQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); ++ _mm_realloc(seqBufLeftQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); + mmc->seqBufLeftQer[tid*CACHE_LINE] = seqBufLeftQer = seqBufQer_; +- ++ + seqBufQer_ = (uint8_t*) +- _mm_realloc(seqBufRightQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); +- mmc->seqBufRightQer[tid*CACHE_LINE] = seqBufRightQer = seqBufQer_; ++ _mm_realloc(seqBufRightQer, tmp, *wsize_buf_qer, sizeof(uint8_t)); ++ mmc->seqBufRightQer[tid*CACHE_LINE] = seqBufRightQer = seqBufQer_; + } + + rightRefOffset += sp.len1; +@@ -2384,25 +2296,25 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + int64_t tmp = *wsize_buf_ref; + *wsize_buf_ref *= 2; + uint8_t *seqBufRef_ = (uint8_t*) +- _mm_realloc(seqBufLeftRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); ++ _mm_realloc(seqBufLeftRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); + mmc->seqBufLeftRef[tid*CACHE_LINE] = seqBufLeftRef = seqBufRef_; +- ++ + seqBufRef_ = (uint8_t*) +- _mm_realloc(seqBufRightRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); +- mmc->seqBufRightRef[tid*CACHE_LINE] = seqBufRightRef = seqBufRef_; ++ _mm_realloc(seqBufRightRef, tmp, *wsize_buf_ref, sizeof(uint8_t)); ++ mmc->seqBufRightRef[tid*CACHE_LINE] = seqBufRightRef = seqBufRef_; + } +- ++ + tprof[PE23][tid] += sp.len1 + sp.len2; + + uint8_t *qs = seqBufRightQer + sp.idq; + uint8_t *rs = seqBufRightRef + sp.idr; +- ++ + for (int i = 0; i < sp.len2; ++i) qs[i] = query[qe + i]; + + for (int i = 0; i < sp.len1; ++i) rs[i] = rseq[re + i]; //seq1 + + int minval = sp.h0 + min_(sp.len1, sp.len2) * opt->a; +- ++ + if (sp.len1 < MAX_SEQ_LEN8 && sp.len2 < MAX_SEQ_LEN8 && minval < MAX_SEQ_LEN8) { + numPairsRight128++; + } +@@ -2437,16 +2349,16 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + } + } + // tprof[MEM_ALN2_UP][tid] += __rdtsc() - timUP; +- ++ + + int32_t *hist = (int32_t *)_mm_malloc((MAX_SEQ_LEN8 + MAX_SEQ_LEN16 + 32) * + sizeof(int32_t), 64); +- ++ + /* Sorting based score is required as that affects the use of SIMD lanes */ + sortPairsLenExt(seqPairArrayLeft128, numPairsLeft, seqPairArrayAux, hist, + numPairsLeft128, numPairsLeft16, numPairsLeft1, opt->a); + assert(numPairsLeft == (numPairsLeft128 + numPairsLeft16 + numPairsLeft1)); +- ++ + + // SWA + // uint64_t timL = __rdtsc(); +@@ -2457,17 +2369,17 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + BandedPairWiseSW bswLeft(opt->o_del, opt->e_del, opt->o_ins, + opt->e_ins, opt->zdrop, opt->pen_clip5, + opt->mat, opt->a, opt->b, nthreads); +- ++ + BandedPairWiseSW bswRight(opt->o_del, opt->e_del, opt->o_ins, + opt->e_ins, opt->zdrop, opt->pen_clip3, + opt->mat, opt->a, opt->b, nthreads); +- ++ + int i; + // Left + SeqPair *pair_ar = seqPairArrayLeft128 + numPairsLeft128 + numPairsLeft16; + SeqPair *pair_ar_aux = seqPairArrayAux; + int nump = numPairsLeft1; +- ++ + // scalar + for ( i=0; iseqid].a[sp->regid]); // prev + int prev = a->score; +@@ -2514,7 +2426,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + a->seedcov += t->len; + } + } +- ++ + } else { + pair_ar_aux[num++] = *sp; + } +@@ -2531,7 +2443,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + pair_ar = seqPairArrayLeft128 + numPairsLeft128; + pair_ar_aux = seqPairArrayAux; +- ++ + nump = numPairsLeft16; + for ( i=0; iseqid].a[sp->regid]); // prev +@@ -2563,7 +2475,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + int prev = a->score; + a->score = sp->score; + +- ++ + if (a->score == prev || sp->max_off < (w >> 1) + (w >> 2) || + i+1 == MAX_BAND_TRY) + { +@@ -2599,13 +2511,13 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + //****************** Left - vector int8 *********************** + pair_ar = seqPairArrayLeft128; + pair_ar_aux = seqPairArrayAux; +- ++ + nump = numPairsLeft128; + for ( i=0; iw << i; + // int64_t tim = __rdtsc(); +- ++ + #if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE2__)) + bswLeft.scalarBandedSWAWrapper(pair_ar, seqBufLeftRef, seqBufLeftQer, nump, nthreads, w); + #else +@@ -2616,22 +2528,22 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + nump, + nthreads, + w); +-#endif +- ++#endif ++ + tprof[PE1][0] += nump; + tprof[PE2][0] ++; + // tprof[MEM_ALN2_D][tid] += __rdtsc() - tim; + + int num = 0; + for (int l=0; lseqid].a[sp->regid]); // prev + + int prev = a->score; + a->score = sp->score; +- ++ + if (a->score == prev || sp->max_off < (w >> 1) + (w >> 2) || + i+1 == MAX_BAND_TRY) + { +@@ -2665,12 +2577,12 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + } + + // tprof[CLEFT][tid] += __rdtsc() - timL; +- ++ + // uint64_t timR = __rdtsc(); + // ********************************************************** + // Right, scalar + for (int l=0; lseqid].a[sp->regid]); // prev + sp->h0 = a->score; +@@ -2688,7 +2600,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + for ( i=0; iw << i; +- // tim = __rdtsc(); ++ // tim = __rdtsc(); + bswRight.scalarBandedSWAWrapper(pair_ar, + seqBufRightRef, + seqBufRightQer, +@@ -2702,12 +2614,12 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + for (int l=0; lseqid].a[sp->regid]); // prev + int prev = a->score; + a->score = sp->score; +- ++ + // no further banding + if (a->score == prev || sp->max_off < (w >> 1) + (w >> 2) || + i+1 == MAX_BAND_TRY) +@@ -2745,7 +2657,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + pair_ar = seqPairArrayRight128 + numPairsRight128; + pair_ar_aux = seqPairArrayAux; + nump = numPairsRight16; +- ++ + for ( i=0; iw << i; +@@ -2765,18 +2677,18 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + tprof[PE7][0] += nump; + tprof[PE8][0] ++; + // tprof[MEM_ALN2_C][tid] += __rdtsc() - tim; +- ++ + int num = 0; + + for (int l=0; lseqid].a[sp->regid]); // prev + //OutScore *o = outScoreArray + l; + int prev = a->score; + a->score = sp->score; +- ++ + // no further banding + if (a->score == prev || sp->max_off < (w >> 1) + (w >> 2) || + i+1 == MAX_BAND_TRY) +@@ -2815,14 +2727,14 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + pair_ar = seqPairArrayRight128; + pair_ar_aux = seqPairArrayAux; + nump = numPairsRight128; +- ++ + for ( i=0; iw << i; + // uint64_t tim = __rdtsc(); +- ++ + #if ((!__AVX512BW__) && (!__AVX2__) && (!__SSE2__)) +- bswRight.scalarBandedSWAWrapper(pair_ar, seqBufRightRef, seqBufRightQer, nump, nthreads, w); ++ bswRight.scalarBandedSWAWrapper(pair_ar, seqBufRightRef, seqBufRightQer, nump, nthreads, w); + #else + sortPairsLen(pair_ar, nump, seqPairArrayAux, hist); + bswRight.getScores8(pair_ar, +@@ -2831,8 +2743,8 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + nump, + nthreads, + w); +-#endif +- ++#endif ++ + tprof[PE3][0] += nump; + tprof[PE4][0] ++; + // tprof[MEM_ALN2_E][tid] += __rdtsc() - tim; +@@ -2840,7 +2752,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + for (int l=0; lseqid].a[sp->regid]); // prev + //OutScore *o = outScoreArray + l; +@@ -2881,22 +2793,22 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + _mm_free(hist); + // tprof[CRIGHT][tid] += __rdtsc() - timR; +- ++ + if (numPairsLeft >= *wsize_pair || numPairsRight >= *wsize_pair) + { // refine it! + fprintf(stderr, "Error: Unexpected behaviour!!!\n"); + fprintf(stderr, "Error: assert failed for seqPair size, " +- "numPairsLeft: %d, numPairsRight %d, lim: %d\nExiting.\n", +- numPairsLeft, numPairsRight, *wsize_pair); ++ "numPairsLeft: %d, numPairsRight %d\nExiting.\n", ++ numPairsLeft, numPairsRight); + exit(EXIT_FAILURE); + } + /* Discard seeds and hence their alignemnts */ +- ++ + lim_g[0] = 0; + for (int l=1; ln; j++) + { + c = &chn->a[j]; +@@ -2919,7 +2831,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + + uint32_t *srt2 = srtg + s_start; + s_start += c->n; +- ++ + int k = 0; + for (k = c->n-1; k >= 0; k--) + { +@@ -2939,12 +2851,12 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + || s->qbeg + s->len > p->qe) { + v++; continue; // not fully contained + } +- ++ + if (s->len - p->seedlen0 > .1 * l_query) { v++; continue;} + // qd: distance ahead of the seed on query; rd: on reference + qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; + // the maximal gap allowed in regions ahead of the seed +- max_gap = cal_max_gap(opt, qd < rd? qd : rd); ++ max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < p->w? max_gap : p->w; // bounded by the band width + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + // similar to the previous four lines, but this time we look at the region behind +@@ -2952,10 +2864,10 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < p->w? max_gap : p->w; + if (qd - rd < w && rd - qd < w) break; +- ++ + v++; + } +- ++ + // the seed is (almost) contained in an existing alignment; + // further testing is needed to confirm it is not leading + // to a different aln +@@ -2969,7 +2881,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + //if (t->done == H0_) continue; //check for interferences!!! + // only check overlapping if t is long enough; + // TODO: more efficient by early stopping +- if (t->len < s->len * .95) continue; ++ if (t->len < s->len * .95) continue; + if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && + t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && +@@ -2982,7 +2894,7 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + tprof[PE18][tid]++; + continue; + } +- } ++ } + lim[l]++; + } + } +@@ -2990,5 +2902,5 @@ void mem_chain2aln_across_reads_V2(const mem_opt_t *opt, const bntseq_t *bns, + free(srtgg); + free(srt); + free(lim); +- // tprof[MEM_ALN2_DOWN][tid] += __rdtsc() - tim; ++ // tprof[MEM_ALN2_DOWN][tid] += __rdtsc() - tim; + } +diff --git a/src/bwamem.h b/src/bwamem.h +old mode 100755 +new mode 100644 +index 12329cb..8189ad6 +--- a/src/bwamem.h ++++ b/src/bwamem.h +@@ -206,8 +206,6 @@ typedef struct + uint8_t *enc_qdb[MAX_THREADS]; + + int64_t wsize_mem[MAX_THREADS]; +- int64_t wsize_mem_s[MAX_THREADS]; +- int64_t wsize_mem_r[MAX_THREADS]; + } mem_cache; + + // chain moved to .h +diff --git a/src/fastmap.cpp b/src/fastmap.cpp +index 5cbb2dc..c701201 100644 +--- a/src/fastmap.cpp ++++ b/src/fastmap.cpp +@@ -52,7 +52,7 @@ void __cpuid(unsigned int i, unsigned int cpuid[4]) { + #ifdef _WIN32 + __cpuid((int *) cpuid, (int)i); + +-#else ++#elif defined(__x86_64__) || defined(__i386__) + asm volatile + ("cpuid" : "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) + : "0" (i), "2" (0)); +@@ -62,6 +62,7 @@ void __cpuid(unsigned int i, unsigned int cpuid[4]) { + + int HTStatus() + { ++#if defined(__x86_64__) || defined(__i386__) + unsigned int cpuid[4]; + char platform_vendor[12]; + __cpuid(0, cpuid); +@@ -93,13 +94,16 @@ int HTStatus() + fprintf(stderr, "CPUs support hyperThreading !!\n"); + + return ht; ++#else ++ return 0; ++#endif + } + + + /*** Memory pre-allocations ***/ + void memoryAlloc(ktp_aux_t *aux, worker_t &w, int32_t nreads, int32_t nthreads) + { +- mem_opt_t *opt = aux->opt; ++ mem_opt_t *opt = aux->opt; + int32_t memSize = nreads; + int32_t readLen = READ_LEN; + +@@ -123,7 +127,7 @@ void memoryAlloc(ktp_aux_t *aux, worker_t &w, int32_t nreads, int32_t nthreads) + fprintf(stderr, "------------------------------------------\n"); + fprintf(stderr, "1. Memory pre-allocation for Chaining: %0.4lf MB\n", allocMem/1e6); + +- ++ + /* SWA mem allocation */ + int64_t wsize = BATCH_SIZE * SEEDS_PER_READ; + for(int l=0; ln_threads * 2+ +- (wsize * MAX_SEQ_LEN_QER * sizeof(int8_t) + MAX_LINE_LEN) * opt->n_threads * 2 + +- wsize * sizeof(SeqPair) * opt->n_threads * 3; ++ (wsize * MAX_SEQ_LEN_QER * sizeof(int8_t) + MAX_LINE_LEN) * opt->n_threads * 2 + ++ wsize * sizeof(SeqPair) * opt->n_threads * 3; + fprintf(stderr, "2. Memory pre-allocation for BSW: %0.4lf MB\n", allocMem/1e6); + + for (int l=0; ltask_size, sz, ret->n_seqs); ++ aux->task_size, sz, ret->n_seqs); + + if (ret->seqs == 0) { + free(ret); +@@ -226,9 +227,9 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + fprintf(stderr, "\t[0000][ M::%s] read %d sequences (%ld bp)...\n", + __func__, ret->n_seqs, (long)size); + } +- ++ + return ret; +- } // Step 0 ++ } // Step 0 + else if (step == 1) /* Step 2: Main processing-engine */ + { + static int task = 0; +@@ -241,8 +242,8 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + w.chain_ar = (mem_chain_v*) malloc (w.nreads * sizeof(mem_chain_v)); + w.seedBuf = (mem_seed_t *) calloc(sizeof(mem_seed_t), w.nreads * AVG_SEEDS_PER_READ); + assert(w.regs != NULL); assert(w.chain_ar != NULL); assert(w.seedBuf != NULL); +- } +- ++ } ++ + fprintf(stderr, "[0000] Calling mem_process_seqs.., task: %d\n", task++); + + uint64_t tim = __rdtsc(); +@@ -256,7 +257,7 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + + fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences.....\n", + __func__, n_sep[0], n_sep[1]); +- ++ + if (n_sep[0]) { + tmp_opt.flag &= ~MEM_F_PE; + /* single-end sequences, in the mixture */ +@@ -266,7 +267,7 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + sep[0], + 0, + w); +- ++ + for (int i = 0; i < n_sep[0]; ++i) + ret->seqs[sep[0][i].id].sam = sep[0][i].sam; + } +@@ -279,7 +280,7 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + sep[1], + aux->pes0, + w); +- ++ + for (int i = 0; i < n_sep[1]; ++i) + ret->seqs[sep[1][i].id].sam = sep[1][i].sam; + } +@@ -293,15 +294,15 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + ret->seqs, + aux->pes0, + w); +- } ++ } + tprof[MEM_PROCESS2][0] += __rdtsc() - tim; + +- aux->n_processed += ret->n_seqs; + return ret; +- } ++ } + /* Step 3: Write output */ + else if (step == 2) + { ++ aux->n_processed += ret->n_seqs; + uint64_t tim = __rdtsc(); + + for (int i = 0; i < ret->n_seqs; ++i) +@@ -320,15 +321,15 @@ ktp_data_t *kt_pipeline(void *shared, int step, void *data, mem_opt_t *opt, work + + return 0; + } // step 2 +- ++ + return 0; + } + + static void *ktp_worker(void *data) +-{ ++{ + ktp_worker_t *w = (ktp_worker_t*) data; + ktp_t *p = w->pl; +- ++ + while (w->step < p->n_steps) { + // test whether we can kick off the job with this worker + int pthread_ret = pthread_mutex_lock(&p->mutex); +@@ -372,7 +373,7 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + mem_opt_t *opt = aux->opt; + int32_t nthreads = opt->n_threads; // global variable for profiling! + w.nthreads = opt->n_threads; +- ++ + #if NUMA_ENABLED + int deno = 1; + int tc = numa_num_task_cpus(); +@@ -381,7 +382,7 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + fprintf(stderr, "num_cpus: %d, num_numas: %d, configured cpus: %d\n", tc, tn, tcc); + int ht = HTStatus(); + if (ht) deno = 2; +- ++ + if (nthreads < tcc/tn/deno) { + fprintf(stderr, "Enabling single numa domain...\n\n"); + // numa_set_preferred(0); +@@ -409,7 +410,7 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + int num_sockets = num_total_logical_cpus / num_logical_cpus; + fprintf(stderr, "#sockets: %d, #cores/socket: %d, #logical_cpus: %d, #ht/core: %d\n", + num_sockets, num_logical_cpus/num_ht, num_total_logical_cpus, num_ht); +- ++ + for (int i=0; iactual_chunk_size/ READ_LEN + 10; +- ++ + /* All memory allocation */ + memoryAlloc(aux, w, nreads, nthreads); + fprintf(stderr, "* Threads used (compute): %d\n", nthreads); +- ++ + /* pipeline using pthreads */ + ktp_t aux_; + int p_nt = pipe_threads; // 2; + int n_steps = 3; +- ++ + w.ref_string = aux->ref_string; + w.fmi = aux->fmi; + w.nreads = nreads; + // w.memSize = nreads; +- ++ + aux_.n_workers = p_nt; + aux_.n_steps = n_steps; + aux_.shared = aux; +@@ -484,7 +485,7 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + fprintf(stderr, "* No. of pipeline threads: %d\n\n", p_nt); + aux_.workers = (ktp_worker_t*) malloc(p_nt * sizeof(ktp_worker_t)); + assert(aux_.workers != NULL); +- ++ + for (int i = 0; i < p_nt; ++i) { + ktp_worker_t *wr = &aux_.workers[i]; + wr->step = 0; wr->pl = &aux_; wr->data = 0; +@@ -493,13 +494,13 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + wr->opt = opt; + wr->w = &w; + } +- ++ + pthread_t *ptid = (pthread_t *) calloc(p_nt, sizeof(pthread_t)); + assert(ptid != NULL); +- ++ + for (int i = 0; i < p_nt; ++i) + pthread_create(&ptid[i], 0, ktp_worker, (void*) &aux_.workers[i]); +- ++ + for (int i = 0; i < p_nt; ++i) + pthread_join(ptid[i], 0); + +@@ -511,14 +512,14 @@ static int process(void *shared, gzFile gfp, gzFile gfp2, int pipe_threads) + free(ptid); + free(aux_.workers); + /***** pipeline ends ******/ +- ++ + fprintf(stderr, "[0000] Computation ends..\n"); +- +- /* Dealloc memory allcoated in the header section */ ++ ++ /* Dealloc memory allcoated in the header section */ + free(w.chain_ar); + free(w.regs); + free(w.seedBuf); +- ++ + for(int l=0; l.alt file)\n"); + fprintf(stderr, " -5 for split alignment, take the alignment with the smallest coordinate as primary\n"); + fprintf(stderr, " -q don't modify mapQ of supplementary alignments\n"); +- fprintf(stderr, " -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) []\n"); ++ fprintf(stderr, " -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) []\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); + fprintf(stderr, " -h INT[,INT] if there are 80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt); +@@ -619,7 +620,7 @@ int main_mem(int argc, char *argv[]) + int fixed_chunk_size = -1; + char *p, *rg_line = 0, *hdr_line = 0; + const char *mode = 0; +- ++ + mem_opt_t *opt, opt0; + gzFile fp, fp2 = 0; + void *ko = 0, *ko2 = 0; +@@ -628,16 +629,16 @@ int main_mem(int argc, char *argv[]) + ktp_aux_t aux; + bool is_o = 0; + uint8_t *ref_string; +- ++ + memset(&aux, 0, sizeof(ktp_aux_t)); + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + for (i = 0; i < 4; ++i) pes[i].failed = 1; +- ++ + // opterr = 0; + aux.fp = stdout; + aux.opt = opt = mem_opt_init(); + memset(&opt0, 0, sizeof(mem_opt_t)); +- ++ + /* Parse input arguments */ + // comment: added option '5' in the list + while ((c = getopt(argc, argv, "51qpaMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:o:f:")) >= 0) +@@ -780,7 +781,7 @@ int main_mem(int argc, char *argv[]) + return 1; + } + } +- ++ + /* Check output file name */ + if (rg_line) + { +@@ -792,7 +793,7 @@ int main_mem(int argc, char *argv[]) + if (optind + 2 != argc && optind + 3 != argc) { + usage(opt); + free(opt); +- if (is_o) ++ if (is_o) + fclose(aux.fp); + return 1; + } +@@ -841,52 +842,52 @@ int main_mem(int argc, char *argv[]) + return 1; + } + } else update_a(opt, &opt0); +- ++ + /* Matrix for SWA */ + bwa_fill_scmat(opt->a, opt->b, opt->mat); +- ++ + /* Load bwt2/FMI index */ + uint64_t tim = __rdtsc(); +- +- fprintf(stderr, "* Ref file: %s\n", argv[optind]); ++ ++ fprintf(stderr, "* Ref file: %s\n", argv[optind]); + aux.fmi = new FMI_search(argv[optind]); + aux.fmi->load_index(); + tprof[FMI][0] += __rdtsc() - tim; +- ++ + // reading ref string from the file + tim = __rdtsc(); + fprintf(stderr, "* Reading reference genome..\n"); +- ++ + char binary_seq_file[PATH_MAX]; + strcpy_s(binary_seq_file, PATH_MAX, argv[optind]); + strcat_s(binary_seq_file, PATH_MAX, ".0123"); + //sprintf(binary_seq_file, "%s.0123", argv[optind]); +- ++ + fprintf(stderr, "* Binary seq file = %s\n", binary_seq_file); + FILE *fr = fopen(binary_seq_file, "r"); +- ++ + if (fr == NULL) { + fprintf(stderr, "Error: can't open %s input file\n", binary_seq_file); + exit(EXIT_FAILURE); + } +- ++ + int64_t rlen = 0; +- fseek(fr, 0, SEEK_END); ++ fseek(fr, 0, SEEK_END); + rlen = ftell(fr); + ref_string = (uint8_t*) _mm_malloc(rlen, 64); + aux.ref_string = ref_string; + rewind(fr); +- ++ + /* Reading ref. sequence */ + err_fread_noeof(ref_string, 1, rlen, fr); +- ++ + uint64_t timer = __rdtsc(); + tprof[REF_IO][0] += timer - tim; +- ++ + fclose(fr); + fprintf(stderr, "* Reference genome size: %ld bp\n", rlen); + fprintf(stderr, "* Done reading reference genome !!\n\n"); +- ++ + if (ignore_alt) + for (i = 0; i < aux.fmi->idx->bns->n_seqs; ++i) + aux.fmi->idx->bns->anns[i].is_alt = 0; +@@ -896,7 +897,7 @@ int main_mem(int argc, char *argv[]) + if (ko == 0) { + fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); + free(opt); +- if (is_o) ++ if (is_o) + fclose(aux.fp); + delete aux.fmi; + // kclose(ko); +@@ -905,7 +906,7 @@ int main_mem(int argc, char *argv[]) + // fp = gzopen(argv[optind + 1], "r"); + fp = gzdopen(fd, "r"); + aux.ks = kseq_init(fp); +- ++ + // PAIRED_END + /* Handling Paired-end reads */ + aux.ks2 = 0; +@@ -923,13 +924,13 @@ int main_mem(int argc, char *argv[]) + free(ko); + err_gzclose(fp); + kseq_destroy(aux.ks); +- if (is_o) +- fclose(aux.fp); ++ if (is_o) ++ fclose(aux.fp); + delete aux.fmi; + kclose(ko); + // kclose(ko2); + return 1; +- } ++ } + // fp2 = gzopen(argv[optind + 2], "r"); + fp2 = gzdopen(fd2, "r"); + aux.ks2 = kseq_init(fp2); +@@ -952,7 +953,7 @@ int main_mem(int argc, char *argv[]) + + /* Relay process function */ + process(&aux, fp, fp2, no_mt_io? 1:2); +- ++ + tprof[PROCESS][0] += __rdtsc() - tim; + + // free memory +@@ -960,7 +961,7 @@ int main_mem(int argc, char *argv[]) + _mm_free(ref_string); + free(hdr_line); + free(opt); +- kseq_destroy(aux.ks); ++ kseq_destroy(aux.ks); + err_gzclose(fp); kclose(ko); + + // PAIRED_END +@@ -968,17 +969,18 @@ int main_mem(int argc, char *argv[]) + kseq_destroy(aux.ks2); + err_gzclose(fp2); kclose(ko2); + } +- ++ + if (is_o) { + fclose(aux.fp); + } + + // new bwt/FMI +- delete(aux.fmi); ++ delete(aux.fmi); + + /* Display runtime profiling stats */ + tprof[MEM][0] = __rdtsc() - tprof[MEM][0]; + display_stats(nt); +- ++ + return 0; + } ++ +diff --git a/src/ksw.cpp b/src/ksw.cpp +index ad9bc50..9369713 100644 +--- a/src/ksw.cpp ++++ b/src/ksw.cpp +@@ -30,7 +30,6 @@ + #include + #include + #include +-#include + #include "ksw.h" + #include "macro.h" + +diff --git a/src/ksw.h b/src/ksw.h +index 54bcef8..3179fc4 100644 +--- a/src/ksw.h ++++ b/src/ksw.h +@@ -26,7 +26,8 @@ + #define __AC_KSW_H + + #include +-#include ++#define SIMDE_ENABLE_NATIVE_ALIASES ++#include + + #define KSW_XBYTE 0x10000 + #define KSW_XSTOP 0x20000 +diff --git a/src/kswv.h b/src/kswv.h +index 11da4d7..aed1d1a 100644 +--- a/src/kswv.h ++++ b/src/kswv.h +@@ -39,7 +39,8 @@ Authors: Vasimuddin Md ; Sanchit Misra ++#define SIMDE_ENABLE_NATIVE_ALIASES ++#include + #endif + + #ifdef __GNUC__ +diff --git a/src/main.cpp b/src/main.cpp +index abee98d..6eba82b 100644 +--- a/src/main.cpp ++++ b/src/main.cpp +@@ -85,6 +85,10 @@ int main(int argc, char* argv[]) + fprintf(stderr, "Executing in SSE4.2 mode!!\n"); + #elif __SSE4_1__ + fprintf(stderr, "Executing in SSE4.1 mode!!\n"); ++#elif __SSE2__ ++ fprintf(stderr, "Executing in SSE2 mode!!\n"); ++#else ++ fprintf(stderr, "Executing in Scalar mode!!\n"); + #endif + fprintf(stderr, "-----------------------------\n"); + +diff --git a/src/runsimd.cpp b/src/runsimd.cpp +index 03c0e91..e0798f2 100644 +--- a/src/runsimd.cpp ++++ b/src/runsimd.cpp +@@ -61,7 +61,7 @@ void __cpuidex(int cpuid[4], int func_id, int subfunc_id) + __asm__ volatile ("cpuid" + : "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) + : "0" (func_id), "2" (subfunc_id)); +-#else // on 32bit, ebx can NOT be used as PIC code ++#elif defined(__i386__) // on 32bit, ebx can NOT be used as PIC code + __asm__ volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1" + : "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) + : "0" (func_id), "2" (subfunc_id)); +diff --git a/src/utils.h b/src/utils.h +index fbf8439..494b515 100644 +--- a/src/utils.h ++++ b/src/utils.h +@@ -48,6 +48,7 @@ + + #define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) + ++// clang and gcc 11+ define __rdtsc intrinsics on x86/64 + #if defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__) + #if defined(__i386__) + static inline unsigned long long __rdtsc(void) +@@ -65,6 +66,46 @@ static inline unsigned long long __rdtsc(void) + } + #endif + #endif ++// From https://github.com/google/benchmark/blob/37177a84b7e8d33696ea1e1854513cb0de3b4dc3/src/cycleclock.h ++// Apache 2.0 license ++#if defined(__aarch64__) ++ // System timer of ARMv8 runs at a different frequency than the CPU's. ++ // The frequency is fixed, typically in the range 1-50MHz. It can be ++ // read at CNTFRQ special register. We assume the OS has set up ++ // the virtual timer properly. ++static inline unsigned long long __rdtsc(void) ++{ ++ int64_t virtual_timer_value; ++ asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); ++ return virtual_timer_value; ++} ++#elif defined(__ARM_ARCH) ++ // V6 is the earliest arch that has a standard cyclecount ++ // Native Client validator doesn't allow MRC instructions. ++#if (__ARM_ARCH >= 6) ++static inline unsigned long long __rdtsc(void) ++{ ++ uint32_t pmccntr; ++ uint32_t pmuseren; ++ uint32_t pmcntenset; ++ // Read the user mode perf monitor counter access permissions. ++ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); ++ if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. ++ asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); ++ if (pmcntenset & 0x80000000ul) { // Is it counting? ++ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); ++ // The counter is set up to count every 64th cycle ++ return static_cast(pmccntr) * 64; // Should optimize to << 6 ++ } ++ } ++ struct timeval tv; ++ gettimeofday(&tv, nullptr); ++ return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; ++} ++#else ++#error __ARM_ARCH < 6 does not have a standard cyclecount ++#endif ++#endif + + typedef struct { + uint64_t x, y; + diff --git a/recipes/bwa-mem2/build.sh b/recipes/bwa-mem2/build.sh index 9dcc42665c445..8e25856454051 100644 --- a/recipes/bwa-mem2/build.sh +++ b/recipes/bwa-mem2/build.sh @@ -9,7 +9,13 @@ if [[ $OSTYPE == "darwin"* ]]; then sed -i.bak 's/memset_s/memset8_s/g' ext/safestringlib/safeclib/memset_s.c sed -i.bak 's/memset_s/memset8_s/g' ext/safestringlib/safeclib/wmemset_s.c fi -LIBS="${LDFLAGS}" make CC="${CC}" CXX="${CXX}" multi -mkdir -p $PREFIX/bin -cp bwa-mem2* $PREFIX/bin +case ${ARCH} in + x86_64) LIBS="${LDFLAGS}" make CC="${CC}" CXX="${CXX}" multi ;; + aarch64) LIBS="${LDFLAGS}" make CC="${CC}" CXX="${CXX}" arch=armv8.1-a ;; +esac + +mkdir -p "$PREFIX/bin" || exit 1 +cp bwa-mem2* "$PREFIX/bin" || exit 1 + + diff --git a/recipes/bwa-mem2/meta.yaml b/recipes/bwa-mem2/meta.yaml index 1c71c3ab43c85..0c728c2dc035f 100644 --- a/recipes/bwa-mem2/meta.yaml +++ b/recipes/bwa-mem2/meta.yaml @@ -11,7 +11,9 @@ source: - Makefile.patch build: - number: 5 + number: 7 + run_exports: + - {{ pin_subpackage('bwa-mem2', max_pin="x") }} requirements: build: @@ -24,11 +26,11 @@ requirements: test: commands: - bwa-mem2 version - - bwa-mem2.sse41 version - - bwa-mem2.sse42 version - - bwa-mem2.avx version - - bwa-mem2.avx2 version - - bwa-mem2.avx512bw version + - bwa-mem2.sse41 version # [not aarch64] + - bwa-mem2.sse42 version # [not aarch64] + - bwa-mem2.avx version # [not aarch64] + - bwa-mem2.avx2 version # [not aarch64] + - bwa-mem2.avx512bw version # [not aarch64] about: home: https://github.com/bwa-mem2/bwa-mem2 @@ -39,3 +41,5 @@ about: extra: identifiers: - doi:10.1109/IPDPS.2019.00041 + additional-platforms: + - linux-aarch64 \ No newline at end of file