From 8125011c1861fa097094d95d9832ce1b11739cd0 Mon Sep 17 00:00:00 2001 From: "Shutov, Oleg" Date: Mon, 7 Nov 2022 10:15:02 -0500 Subject: [PATCH 1/2] VDB-5087: bitmagick patch --- tools/tax/src/bm/bm.h | 695 ++++++++++---- tools/tax/src/bm/bmaggregator.h | 627 ++++++++----- tools/tax/src/bm/bmalgo.h | 36 - tools/tax/src/bm/bmalgo_impl.h | 66 +- tools/tax/src/bm/bmalloc.h | 50 +- tools/tax/src/bm/bmavx2.h | 363 ++++++-- tools/tax/src/bm/bmblocks.h | 403 +++++++-- tools/tax/src/bm/bmbmatrix.h | 267 ++++-- tools/tax/src/bm/bmbuffer.h | 55 +- tools/tax/src/bm/bmbvimport.h | 12 +- tools/tax/src/bm/bmconst.h | 26 +- tools/tax/src/bm/bmdbg.h | 32 +- tools/tax/src/bm/bmfunc.h | 853 ++++++++++++----- tools/tax/src/bm/bmfwd.h | 2 +- tools/tax/src/bm/bmserial.h | 11 +- tools/tax/src/bm/bmsparsevec.h | 122 ++- tools/tax/src/bm/bmsparsevec_algo.h | 1208 ++++++++++++++++++------- tools/tax/src/bm/bmsparsevec_compr.h | 16 +- tools/tax/src/bm/bmsparsevec_serial.h | 171 ++-- tools/tax/src/bm/bmsse2.h | 345 ++++++- tools/tax/src/bm/bmsse4.h | 417 ++++++++- tools/tax/src/bm/bmsse_util.h | 2 +- tools/tax/src/bm/bmstrsparsevec.h | 592 ++++++++++-- tools/tax/src/bm/bmundef.h | 13 + tools/tax/src/bm/bmutil.h | 33 +- tools/tax/src/bm/bmxor.h | 31 +- 26 files changed, 4920 insertions(+), 1528 deletions(-) diff --git a/tools/tax/src/bm/bm.h b/tools/tax/src/bm/bm.h index 725b3321..e5973393 100644 --- a/tools/tax/src/bm/bm.h +++ b/tools/tax/src/bm/bm.h @@ -786,42 +786,8 @@ class bvector @ingroup bvector @internal */ - class mem_pool_guard - { - public: - mem_pool_guard() BMNOEXCEPT : bv_(0) - {} - - mem_pool_guard(allocator_pool_type& pool, bvector& bv) BMNOEXCEPT - : bv_(&bv) - { - bv.set_allocator_pool(&pool); - } - ~mem_pool_guard() - { - if (bv_) - bv_->set_allocator_pool(0); - } - - /// check if vector has no assigned allocator and set one - void assign_if_not_set(allocator_pool_type& pool, - bvector& bv) BMNOEXCEPT - { - if (!bv.get_allocator_pool()) // alloc pool not set yet - { - BM_ASSERT(!bv_); - bv_ = &bv; - bv_->set_allocator_pool(&pool); - } - } - - private: - mem_pool_guard(const mem_pool_guard&) = delete; - void operator=(const mem_pool_guard&) = delete; - private: - bvector* bv_; ///< garded object - }; - + typedef + bm::alloc_pool_guard > mem_pool_guard; friend class iterator_base; friend class enumerator; @@ -906,7 +872,6 @@ class bvector /*! \brief Copy constructor for range copy [left..right] - \sa copy_range */ bvector(const bvector& bvect, size_type left, size_type right) @@ -921,24 +886,50 @@ class bvector copy_range_no_check(bvect, left, right); } + /*! + \brief Copy-constructor for mutable/immutable initialization + */ + bvector(const bvector& bvect, bm::finalization is_final) + : blockman_(bvect.blockman_.glevel_len_, bvect.blockman_.max_bits_, bvect.blockman_.alloc_), + new_blocks_strat_(bvect.new_blocks_strat_), + size_(bvect.size_) + { + if (!bvect.blockman_.is_init()) + return; + if (is_final == bm::finalization::READONLY) + blockman_.copy_to_arena(bvect.blockman_); + else + blockman_.copy(bvect.blockman_); + } + ~bvector() BMNOEXCEPT {} + /*! - \brief Explicit post-construction initialization + \brief Explicit post-construction initialization. + Must be caled to make sure safe use of *_no_check() methods */ void init(); + /*! + \brief Explicit post-construction initialization. + Must be caled right after construction strickly before any modificating calls + to make sure safe use of *_no_check() methods. + This init can do pre-allocation of top level structures. 
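+ 
+        Example (a minimal sketch; assumes the default 32-bit bm::bvector<>):
+        @code
+        bm::bvector<>  bv;
+        bv.init(256, true);       // pre-allocate the full top level plus sub-blocks
+        bv.set_bit_no_check(10);  // *_no_check() methods are now safe to use
+        @endcode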
+ + @param top_size - request to do pre-allocation of the top level of a sparse bit-vector tree + (can be up to 256 for 32-bit mode) + @param alloc_subs - if true also allocates second level structures + */ + void init(unsigned top_size, bool alloc_subs); + + /*! \brief Copy assignment operator */ bvector& operator=(const bvector& bvect) { - if (this != &bvect) - { - blockman_.deinit_tree(); - blockman_.copy(bvect.blockman_); - resize(bvect.size()); - } + this->copy(bvect, bm::finalization::UNDEFINED); return *this; } @@ -965,9 +956,7 @@ class bvector std::initializer_list::const_iterator it_start = il.begin(); std::initializer_list::const_iterator it_end = il.end(); for (; it_start < it_end; ++it_start) - { this->set_bit_no_check(*it_start); - } } /*! @@ -979,6 +968,16 @@ class bvector return *this; } #endif + + /*! + \brief Copy bvector from the argument bvector + \param bvect - bit-vector to copy from + \param is_final - BM_READONLY - copies as immutable, BM_READWRITE - copies as mutable + even if the argument bvect is read-only vector, + BM_UNDEFINED - follow the argument type as is + */ + void copy(const bvector& bvect, bm::finalization is_final); + /*! \brief Move bvector content from another bvector */ @@ -1046,6 +1045,22 @@ class bvector allocator_pool_type* get_allocator_pool() BMNOEXCEPT { return blockman_.get_allocator().get_pool(); } + // -------------------------------------------------------------------- + /*! @name Read-only / immutable vector methods */ + //@{ + + /// Turn current vector to read-only (immutable vector). + /// After calling this method any modification (non-const methods) will cause undefined behavior + /// (likely crash or assert) + /// + /// \sa is_ro + void freeze(); + + /// Returns true if vector is read-only + bool is_ro() const BMNOEXCEPT { return blockman_.arena_; } + + //@} + // -------------------------------------------------------------------- /*! @name Bit access/modification methods */ //@{ @@ -1146,7 +1161,15 @@ class bvector void clear(const size_type* ids, size_type ids_size, bm::sort_order so=bm::BM_UNKNOWN); - + /*! + \brief swap values of bits + + @param idx1 - index of bit to swap with + @param idx2 - index of bit to swap with + */ + void swap(size_type idx1, size_type idx2); + + /*! \brief Set bit without checking preconditions (size, etc) @@ -1235,13 +1258,13 @@ class bvector \param free_mem if "true" (default) bvector frees the memory, otherwise sets blocks to 0. */ - void clear(bool free_mem = true) { blockman_.set_all_zero(free_mem); } + void clear(bool free_mem = true) BMNOEXCEPT; /*! \brief Clears every bit in the bitvector. \return *this; */ - bvector& reset() { clear(true); return *this; } + bvector& reset() BMNOEXCEPT { clear(true); return *this; } /*! \brief Flips bit n @@ -1766,6 +1789,7 @@ class bvector */ bm::bvector& bit_or(const bm::bvector& bv) { + BM_ASSERT(!is_ro()); combine_operation_or(bv); return *this; } @@ -1778,6 +1802,7 @@ class bvector bm::bvector& bit_and(const bm::bvector& bv, optmode opt_mode = opt_none) { + BM_ASSERT(!is_ro()); combine_operation_and(bv, opt_mode); return *this; } @@ -1788,6 +1813,7 @@ class bvector */ bm::bvector& bit_xor(const bm::bvector& bv) { + BM_ASSERT(!is_ro()); combine_operation_xor(bv); return *this; } @@ -1798,6 +1824,7 @@ class bvector */ bm::bvector& bit_sub(const bm::bvector& bv) { + BM_ASSERT(!is_ro()); combine_operation_sub(bv); return *this; } @@ -1876,11 +1903,6 @@ class bvector */ void calc_stat(struct bm::bvector::statistics* st) const BMNOEXCEPT; - /*! 
- @brief Calculates bitvector arena statistics. - @internal - */ - void calc_arena_stat(bm::bv_arena_statistics* st) const BMNOEXCEPT; /*! \brief Sets new blocks allocation strategy. @@ -2055,7 +2077,19 @@ class bvector Import sorted integers (set bits). (Fast, no checks). @internal */ - void import_sorted(const size_type* ids, const size_type ids_size); + void import_sorted(const size_type* ids, + const size_type ids_size, bool opt_flag); + + /** + \brief Set range without validity/bounds checking + */ + void set_range_no_check(size_type left, + size_type right); + /** + \brief Clear range without validity/bounds checking + */ + void clear_range_no_check(size_type left, + size_type right); //@} @@ -2073,7 +2107,7 @@ class bvector void import_block(const size_type* ids, block_idx_type nblock, size_type start, size_type stop); -//private: + size_type check_or_next(size_type prev) const BMNOEXCEPT; @@ -2157,23 +2191,7 @@ class bvector size_type left, size_type right); - /// calculate arena statistics, calculate and copy all blocks there - /// - void copy_to_arena(typename blocks_manager_type::arena* ar); - private: - - /** - \brief Set range without validity/bounds checking - */ - void set_range_no_check(size_type left, - size_type right); - /** - \brief Clear range without validity/bounds checking - */ - void clear_range_no_check(size_type left, - size_type right); - /** \brief Clear outside the range without validity/bounds checking */ @@ -2261,12 +2279,64 @@ inline bvector operator- (const bvector& bv1, template void bvector::init() { + BM_ASSERT(!is_ro()); if (!blockman_.is_init()) blockman_.init_tree(); } // ----------------------------------------------------------------------- +template +void bvector::init(unsigned top_size, bool alloc_subs) +{ + BM_ASSERT(!is_ro()); + if (!blockman_.is_init()) + blockman_.init_tree(top_size); + if (alloc_subs) + for (unsigned nb = 0; nb < top_size; ++nb) + blockman_.alloc_top_subblock(nb); +} + + +// ----------------------------------------------------------------------- + +template +void bvector::copy(const bvector& bvect, bm::finalization is_final) +{ + if (this != &bvect) + { + blockman_.deinit_tree(); + switch (is_final) + { + case bm::finalization::UNDEFINED: + if (bvect.is_ro()) + { + blockman_.copy_to_arena(bvect.blockman_); + size_ = bvect.size(); + } + else + { + blockman_.copy(bvect.blockman_); + resize(bvect.size()); + } + break; + case bm::finalization::READONLY: + blockman_.copy_to_arena(bvect.blockman_); + size_ = bvect.size(); + break; + case bm::finalization::READWRITE: + blockman_.copy(bvect.blockman_); + resize(bvect.size()); + break; + default: + BM_ASSERT(0); + break; + } // switch + } +} + +// ----------------------------------------------------------------------- + template void bvector::move_from(bvector& bvect) BMNOEXCEPT { @@ -2283,6 +2353,7 @@ void bvector::move_from(bvector& bvect) BMNOEXCEPT template void bvector::keep_range(size_type left, size_type right) { + BM_ASSERT(!is_ro()); if (!blockman_.is_init()) return; // nothing to do @@ -2291,6 +2362,7 @@ void bvector::keep_range(size_type left, size_type right) keep_range_no_check(left, right); } + // ----------------------------------------------------------------------- template @@ -2298,16 +2370,13 @@ bvector& bvector::set_range(size_type left, size_type right, bool value) { - if (!blockman_.is_init()) - { - if (!value) - return *this; // nothing to do - } + BM_ASSERT(!is_ro()); + + if (!blockman_.is_init() && !value) + return *this; // nothing to do if (right < left) - 
{ return set_range(right, left, value); - } BM_ASSERT_THROW(right < bm::id_max, BM_ERR_RANGE); if (right >= size_) // this vect shorter than the arg. @@ -2394,6 +2463,8 @@ bool bvector::any() const BMNOEXCEPT template void bvector::resize(size_type new_size) { + BM_ASSERT(!is_ro()); + if (size_ == new_size) return; // nothing to do if (size_ < new_size) // size grows { @@ -2415,6 +2486,8 @@ void bvector::resize(size_type new_size) template void bvector::sync_size() { + BM_ASSERT(!is_ro()); + if (size_ >= bm::id_max) return; bvector::size_type last; @@ -2904,6 +2977,13 @@ bvector::gap_count_to(const bm::gap_word_t* gap_block, } */ bm::id64_t sub = rs_idx.sub_count(nb); + if (!sub) + { + c = 0; + BM_ASSERT(c == bm::gap_bit_count_to(gap_block, (gap_word_t)nbit_right)); + return c; + } + unsigned sub_cnt = unsigned(sub); unsigned first = bm::gap_word_t(sub_cnt); unsigned second = (sub_cnt >> 16); @@ -3461,6 +3541,7 @@ bvector::count_range_no_check(size_type left, else cnt_l = left; // == 0 cnt_r = this->count_to(right, rs_idx); + BM_ASSERT(cnt_r >= cnt_l); return cnt_r - cnt_l; } @@ -3469,6 +3550,8 @@ bvector::count_range_no_check(size_type left, template bvector& bvector::invert() { + BM_ASSERT(!is_ro()); + if (!size_) return *this; // cannot invert a set of power 0 @@ -3554,6 +3637,8 @@ void bvector::optimize(bm::word_t* temp_block, optmode opt_mode, statistics* stat) { + BM_ASSERT(!is_ro()); + if (!blockman_.is_init()) { if (stat) @@ -3567,7 +3652,7 @@ void bvector::optimize(bm::word_t* temp_block, { stat->reset(); ::memcpy(stat->gap_levels, - blockman_.glen(), sizeof(gap_word_t) * bm::gap_levels); + blockman_.glen(), sizeof(gap_word_t) * bm::gap_levels); stat->max_serialize_mem = (unsigned)sizeof(bm::id_t) * 4; } blockman_.optimize_tree(temp_block, opt_mode, stat); @@ -3590,6 +3675,7 @@ void bvector::optimize_range( bm::word_t* temp_block, optmode opt_mode) { + BM_ASSERT(!is_ro()); BM_ASSERT(left <= right); BM_ASSERT(temp_block); @@ -3640,6 +3726,8 @@ void bvector::optimize_gap_size() template void bvector::set_gap_levels(const gap_word_t* glevel_len) { + BM_ASSERT(!is_ro()); + if (blockman_.is_init()) { word_t*** blk_root = blockman_.top_blocks_root(); @@ -3933,9 +4021,12 @@ void bvector::calc_stat( if (BM_IS_GAP(blk)) { const bm::gap_word_t* gap_blk = BMGAP_PTR(blk); - unsigned cap = bm::gap_capacity(gap_blk, blockman_.glen()); + unsigned cap; unsigned len = gap_length(gap_blk); - st->add_gap_block(cap, len); + cap = is_ro() ? 
len + : bm::gap_capacity(gap_blk, blockman_.glen()); + unsigned level = bm::gap_level(gap_blk); + st->add_gap_block(cap, len, level); } else // bit block st->add_bit_block(); @@ -3956,76 +4047,10 @@ void bvector::calc_stat( st->memory_used += unsigned(sizeof(*this) - sizeof(blockman_)); blocks_mem += st->ptr_sub_blocks * (sizeof(void*) * bm::set_sub_array_size); st->memory_used += blocks_mem; + if (is_ro()) + st->memory_used += sizeof(typename blocks_manager_type::arena); st->bv_count = 1; -} -// ----------------------------------------------------------------------- - -template -void bvector::calc_arena_stat(bm::bv_arena_statistics* st) const BMNOEXCEPT -{ - BM_ASSERT(st); - st->reset(); - bm::word_t*** blk_root = blockman_.top_blocks_root(); - - if (!blk_root) - return; - unsigned top_size = blockman_.top_block_size(); - for (unsigned i = 0; i < top_size; ++i) - { - const bm::word_t* const* blk_blk = blk_root[i]; - if (!blk_blk) - { - ++i; - bool found = bm::find_not_null_ptr(blk_root, i, top_size, &i); - if (!found) - break; - blk_blk = blk_root[i]; - BM_ASSERT(blk_blk); - if (!blk_blk) - break; - } - if ((bm::word_t*)blk_blk == FULL_BLOCK_FAKE_ADDR) - continue; - st->ptr_sub_blocks_sz += bm::set_sub_array_size; - for (unsigned j = 0; j < bm::set_sub_array_size; ++j) - { - const bm::word_t* blk = blk_blk[j]; - if (IS_VALID_ADDR(blk)) - { - if (BM_IS_GAP(blk)) - { - const bm::gap_word_t* gap_blk = BMGAP_PTR(blk); - unsigned len = bm::gap_length(gap_blk); - BM_ASSERT(gap_blk[len-1] == 65535); - st->gap_blocks_sz += len; - } - else // bit block - st->bit_blocks_sz += bm::set_block_size; - } - } // for j - } // for i - -} - -// ----------------------------------------------------------------------- - -template -void bvector::copy_to_arena(typename blocks_manager_type::arena* ar) -{ - bm::word_t*** blk_root = blockman_.top_blocks_root(); - if (!blk_root) - return; - - bm::bv_arena_statistics arena_st; - calc_arena_stat(&arena_st); - blockman_.alloc_arena(ar, arena_st, blockman_.get_allocator()); - - bm::bv_arena_statistics st; - st.reset(); - - blockman_.copy_to_arena(ar, arena_st, st); - } // ----------------------------------------------------------------------- @@ -4064,6 +4089,8 @@ template void bvector::set(const size_type* ids, size_type ids_size, bm::sort_order so) { + BM_ASSERT(!is_ro()); + if (!ids || !ids_size) return; // nothing to do if (!blockman_.is_init()) @@ -4079,6 +4106,8 @@ template void bvector::keep(const size_type* ids, size_type ids_size, bm::sort_order so) { + BM_ASSERT(!is_ro()); + if (!ids || !ids_size || !blockman_.is_init()) { clear(); @@ -4103,10 +4132,26 @@ void bvector::keep(const size_type* ids, size_type ids_size, // ----------------------------------------------------------------------- +template +void bvector::clear(bool free_mem) BMNOEXCEPT +{ + if (is_ro()) + { + BM_ASSERT(free_mem); + blockman_.destroy_arena(); + } + else + blockman_.set_all_zero(free_mem); +} + +// ----------------------------------------------------------------------- + template void bvector::clear(const size_type* ids, size_type ids_size, bm::sort_order so) { + BM_ASSERT(!is_ro()); + if (!ids || !ids_size || !blockman_.is_init()) { return; @@ -4132,6 +4177,8 @@ void bvector::clear(const size_type* ids, template bvector& bvector::set() { + BM_ASSERT(!is_ro()); + set_range(0, size_ - 1, true); return *this; } @@ -4141,6 +4188,8 @@ bvector& bvector::set() template bvector& bvector::set(size_type n, bool val) { + BM_ASSERT(!is_ro()); + set_bit(n, val); return *this; } @@ -4164,6 +4213,7 
@@ bool bvector::set_bit_conditional(size_type n, bool val, bool condition) template bool bvector::set_bit_and(size_type n, bool val) { + BM_ASSERT(!is_ro()); BM_ASSERT(n < size_); BM_ASSERT_THROW(n < size_, BM_ERR_RANGE); @@ -4177,6 +4227,7 @@ bool bvector::set_bit_and(size_type n, bool val) template bool bvector::set_bit(size_type n, bool val) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); if (!blockman_.is_init()) @@ -4195,6 +4246,8 @@ template void bvector::import(const size_type* ids, size_type size_in, bm::sort_order sorted_idx) { + BM_ASSERT(!is_ro()); + size_type n, start(0), stop(size_in); block_idx_type nblock; @@ -4236,7 +4289,8 @@ void bvector::import(const size_type* ids, size_type size_in, template void bvector::import_sorted(const size_type* ids, - const size_type size_in) + const size_type size_in, + bool opt_flag) { BM_ASSERT(size_in); BM_ASSERT(ids[0] < bm::id_max); // limit is 2^31-1 (for 32-bit mode) @@ -4253,11 +4307,11 @@ void bvector::import_sorted(const size_type* ids, { import_block(ids, nblock, 0, stop); unsigned nbit = unsigned(ids[size_in-1] & bm::set_block_mask); - if (nbit == 65535) // last bit in block + if (opt_flag && nbit == 65535) // last bit in block { unsigned i, j; bm::get_block_coord(nblock, i, j); - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_compress); } } else @@ -4277,18 +4331,21 @@ void bvector::import_sorted(const size_type* ids, nblock = (ids[stop] >> bm::set_block_shift); } while (start < size_in); - // multi-block sorted import, lets optimize - n = ids[start]; - nblock = (n >> bm::set_block_shift); - nblock_end = (ids[size_in-1] >> bm::set_block_shift); - unsigned nbit = unsigned(ids[size_in-1] & bm::set_block_mask); - nblock_end += bool(nbit == 65535); - do + + if (opt_flag) // multi-block sorted import, lets optimize { - unsigned i, j; - bm::get_block_coord(nblock++, i, j); - blockman_.optimize_bit_block(i, j); - } while (nblock < nblock_end); + n = ids[start]; + nblock = (n >> bm::set_block_shift); + nblock_end = (ids[size_in-1] >> bm::set_block_shift); + unsigned nbit = unsigned(ids[size_in-1] & bm::set_block_mask); + nblock_end += bool(nbit == 65535); + do + { + unsigned i, j; + bm::get_block_coord(nblock++, i, j); + blockman_.optimize_bit_block(i, j, opt_compress); + } while (nblock < nblock_end); + } } } @@ -4331,9 +4388,201 @@ void bvector::import_block(const size_type* ids, // ----------------------------------------------------------------------- +template +void bvector::swap(size_type idx1, size_type idx2) +{ + BM_ASSERT(!is_ro()); + BM_ASSERT_THROW(idx1 < bm::id_max, BM_ERR_RANGE); + BM_ASSERT_THROW(idx2 < bm::id_max, BM_ERR_RANGE); + + block_idx_type nb1 = (idx1 >> bm::set_block_shift); + block_idx_type nb2 = (idx2 >> bm::set_block_shift); + + bm::word_t* block1, *block2; + unsigned nbit1, nbit2; + + if (nb1 == nb2) // same block hit + { + unsigned i0, j0; + bm::get_block_coord(nb1, i0, j0); + block1 = blockman_.get_block_ptr(i0, j0); + if (!block1 || (block1==FULL_BLOCK_FAKE_ADDR)) // nothing to do? 
+ return; + + nbit1 = unsigned(idx1 & bm::set_block_mask); + nbit2 = unsigned(idx2 & bm::set_block_mask); + if (BM_IS_GAP(block1)) + { + bm::gap_word_t* gblk = BMGAP_PTR(block1); + bool b1 = bm::gap_test_unr(gblk, nbit1); + bool b2 = bm::gap_test_unr(gblk, nbit2); + if (b1 != b2) + { + this->gap_block_set_no_ret(gblk, b2, nb1, nbit1); + block2 = blockman_.get_block_ptr(i0, j0); + if (block1 == block2) // same block + this->gap_block_set_no_ret(gblk, b1, nb1, nbit2); + else + set_bit_no_check(idx2, b1); + } + return; + } + unsigned nword1 = unsigned(nbit1 >> bm::set_word_shift); + unsigned nword2 = unsigned(nbit2 >> bm::set_word_shift); + nbit1 &= bm::set_word_mask; nbit2 &= bm::set_word_mask; + bool b1 = block1[nword1] & (1u << nbit1); + bool b2 = block1[nword2] & (1u << nbit2); + if (b1 != b2) + { + nbit1 = 1u << nbit1; nbit2 = 1u << nbit2; + auto w = block1[nword1]; + (b2) ? w |= nbit1 : w &= ~nbit1; + block1[nword1] = w; + w = block1[nword2]; + (b1) ? w |= nbit2 : w &= ~nbit2; + block1[nword2] = w; + } + return; + } // if (same block) + + { + unsigned i0, j0; + bm::get_block_coord(nb1, i0, j0); + block1 = blockman_.get_block_ptr(i0, j0); + bm::get_block_coord(nb2, i0, j0); + block2 = blockman_.get_block_ptr(i0, j0); + } + if (block1 == block2) // nothing to do + return; + + bm::gap_word_t *gblk1{0}, *gblk2{0}; + unsigned cpos1{0}, cpos2; + bool b1, b2, b1real, b2real; + + if (!block1) + { + b1 = false; b1real = false; + } + else + if (block1 == FULL_BLOCK_FAKE_ADDR) + { + b1 = true; b1real = false; + } + else + { + b1real = true; + nbit1 = unsigned(idx1 & bm::set_block_mask); + if (BM_IS_GAP(block1)) + { + gblk1 = BMGAP_PTR(block1); + unsigned is_set; + cpos1 = bm::gap_bfind(gblk1, nbit1, &is_set); + b1 = is_set; + } + else // bit block + { + unsigned nword1 = unsigned(nbit1 >> bm::set_word_shift); + b1 = block1[nword1] & (1u << (nbit1 & bm::set_word_mask)); + } + } + + if (!block2) + { + b2 = false; b2real = false; + } + else + if (block2 == FULL_BLOCK_FAKE_ADDR) + { + b2 = true; b2real = false; + } + else + { + b2real = true; + nbit2 = unsigned(idx2 & bm::set_block_mask); + if (BM_IS_GAP(block2)) + { + gblk2 = BMGAP_PTR(block2); + unsigned is_set; + cpos2 = bm::gap_bfind(gblk2, nbit2, &is_set); + b2 = is_set; + } + else // bit block + { + unsigned nword2 = unsigned(nbit2 >> bm::set_word_shift); + b2 = block2[nword2] & (1u << (nbit2 & bm::set_word_mask)); + } + } + + if (b1 == b2) + return; + + if (b1real) + { + if (BM_IS_GAP(block1)) + { + unsigned new_len, old_len; + unsigned is_set = b1; + old_len = bm::gap_length(gblk1)-1; + new_len = bm::gap_set_value_cpos(b2, gblk1, nbit1, &is_set, cpos1); + if (old_len < new_len) + { + unsigned threshold = bm::gap_limit(gblk1, blockman_.glen()); + if (new_len > threshold) + blockman_.extend_gap_block(nb1, gblk1); + } + } + else // bit block + { + unsigned nword1 = unsigned(nbit1 >> bm::set_word_shift); + nbit1 = 1u << (nbit1 & bm::set_word_mask); + auto w = block1[nword1]; + (b2) ? 
w |= nbit1 : w &= ~nbit1; + block1[nword1] = w; + } + } + else // block + { + set_bit_no_check(idx1, b2); + } + + if (b2real) + { + if (BM_IS_GAP(block2)) + { + unsigned new_len, old_len; + unsigned is_set = b2; + old_len = bm::gap_length(gblk2)-1; + new_len = bm::gap_set_value_cpos(b1, gblk2, nbit2, &is_set, cpos2); + if (old_len < new_len) + { + unsigned threshold = bm::gap_limit(gblk2, blockman_.glen()); + if (new_len > threshold) + blockman_.extend_gap_block(nb2, gblk2); + } + } + else // bit block + { + unsigned nword2 = unsigned(nbit2 >> bm::set_word_shift); + nbit2 = 1u << (nbit2 & bm::set_word_mask); + auto w = block2[nword2]; + (b1) ? w |= nbit2 : w &= ~nbit2; + block2[nword2] = w; + } + } + else + { + set_bit_no_check(idx2, b1); + } + + +} + +// ----------------------------------------------------------------------- + template bool bvector::set_bit_no_check(size_type n, bool val) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); // calculate logical block number @@ -4382,6 +4631,7 @@ bool bvector::set_bit_no_check(size_type n, bool val) template void bvector::set_bit_no_check(size_type n) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); const bool val = true; // set bit @@ -4415,6 +4665,7 @@ void bvector::set_bit_no_check(size_type n) template void bvector::clear_bit_no_check(size_type n) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); const bool val = false; // clear bit @@ -4485,6 +4736,7 @@ void bvector::gap_block_set_no_ret(bm::gap_word_t* gap_blk, template bool bvector::inc(size_type n) { + BM_ASSERT(!is_ro()); // calculate logical block number block_idx_type nblock = (n >> bm::set_block_shift); bm::word_t* blk = @@ -4493,7 +4745,6 @@ bool bvector::inc(size_type n) BM_ASSERT(IS_VALID_ADDR(blk)); unsigned nbit = unsigned(n & bm::set_block_mask); - unsigned is_set; if (BM_IS_GAP(blk)) { @@ -4505,12 +4756,10 @@ bool bvector::inc(size_type n) { unsigned nword = unsigned(nbit >> bm::set_word_shift); nbit &= bm::set_word_mask; - bm::word_t* word = blk + nword; - bm::word_t mask = (((bm::word_t)1) << nbit); + const bm::word_t mask = (((bm::word_t)1) << nbit); is_set = ((*word) & mask); - - *word = (is_set) ? 
(*word & ~mask) : (*word | mask); + *word ^= mask; // flip the bit } return is_set; } @@ -4572,6 +4821,7 @@ bool bvector::set_bit_conditional_impl(size_type n, template bool bvector::and_bit_no_check(size_type n, bool val) { + BM_ASSERT(!is_ro()); // calculate logical block number block_idx_type nblock = (n >> bm::set_block_shift); @@ -5143,6 +5393,7 @@ template typename bvector::size_type bvector::check_or_next_extract(size_type prev) { + BM_ASSERT(!is_ro()); if (!blockman_.is_init()) return 0; // TODO: optimization @@ -5157,6 +5408,7 @@ bvector::check_or_next_extract(size_type prev) template bool bvector::shift_right() { + BM_ASSERT(!is_ro()); return insert(0, false); } @@ -5165,6 +5417,7 @@ bool bvector::shift_right() template bool bvector::shift_left() { + BM_ASSERT(!is_ro()); bool b = this->test(0); this->erase(0); return b; @@ -5175,6 +5428,7 @@ bool bvector::shift_left() template bool bvector::insert(size_type n, bool value) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); if (size_ < bm::id_max) @@ -5192,31 +5446,58 @@ bool bvector::insert(size_type n, bool value) int block_type; bm::word_t carry_over = 0; - if (!n && !value) // regular shift-right by 1 bit - {} - else // process target block insertion + // 1: process target block insertion + if (value || n) { unsigned i, j; bm::get_block_coord(nb, i, j); bm::word_t* block = blockman_.get_block_ptr(i, j); - if (!block && !value) // nothing to do - {} + const unsigned nbit = unsigned(n & bm::set_block_mask); + if (!block) + { + if (value) + { + block = blockman_.check_allocate_block(nb, get_new_blocks_strat()); + goto insert_bit_check; + } + } else { - if (!block) - block = blockman_.check_allocate_block(nb, BM_BIT); - if (BM_IS_GAP(block) || IS_FULL_BLOCK(block)) - block = blockman_.deoptimize_block(nb); // TODO: optimize GAP block insert - BM_ASSERT(IS_VALID_ADDR(block)); + insert_bit_check: + if (BM_IS_GAP(block)) { - unsigned nbit = unsigned(n & bm::set_block_mask); - carry_over = bm::bit_block_insert(block, nbit, value); + unsigned new_block_len; + bm::gap_word_t* gap_blk = BMGAP_PTR(block); + carry_over = bm::gap_insert(gap_blk, nbit, value, &new_block_len); + unsigned threshold = bm::gap_limit(gap_blk, blockman_.glen()); + if (new_block_len > threshold) + blockman_.extend_gap_block(nb, gap_blk); + } + else + { + if (IS_FULL_BLOCK(block)) + { + if (!value) + { + block = blockman_.deoptimize_block(nb); + goto insert_bit; + } + carry_over = 1; + } + else // BIT block + { + insert_bit: + BM_ASSERT(IS_VALID_ADDR(block)); + carry_over = bm::bit_block_insert(block, nbit, value); + } } } ++nb; } - + + // 2: shift right everything else + // unsigned i0, j0; bm::get_block_coord(nb, i0, j0); @@ -5316,9 +5597,7 @@ bool bvector::insert(size_type n, bool value) carry_over = bm::gap_shift_r1(gap_blk, carry_over, &new_block_len); unsigned threshold = bm::gap_limit(gap_blk, blockman_.glen()); if (new_block_len > threshold) - { blockman_.extend_gap_block(nblock, gap_blk); - } continue; } } @@ -5352,6 +5631,7 @@ bool bvector::insert(size_type n, bool value) template void bvector::erase(size_type n) { + BM_ASSERT(!is_ro()); BM_ASSERT_THROW(n < bm::id_max, BM_ERR_RANGE); if (!blockman_.is_init()) @@ -5521,9 +5801,17 @@ bool bvector::test_first_block_bit(block_idx_type nb) const BMNOEXCEPT template void bvector::merge(bm::bvector& bv) { + BM_ASSERT(!is_ro()); + if (!bv.blockman_.is_init()) // nothing to OR return; + if (bv.is_ro()) // argument is immutable, just use OR + { + this->bit_or(bv); + return; + } + unsigned 
top_blocks = blockman_.top_block_size(); if (size_ < bv.size_) // this vect shorter than the arg. { @@ -5605,6 +5893,8 @@ bvector::bit_or(const bm::bvector& bv1, const bm::bvector& bv2, typename bm::bvector::optmode opt_mode) { + BM_ASSERT(!is_ro()); + if (blockman_.is_init()) blockman_.deinit_tree(); @@ -5679,7 +5969,7 @@ bvector::bit_or(const bm::bvector& bv1, continue; bool need_opt = combine_operation_block_or(i, j, arg_blk1, arg_blk2); if (need_opt && opt_mode == opt_compress) - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_mode); any_blocks |= bool(blk_blk[j]); } while (++j < bm::set_sub_array_size); @@ -5702,6 +5992,8 @@ bvector::bit_xor(const bm::bvector& bv1, const bm::bvector& bv2, typename bm::bvector::optmode opt_mode) { + BM_ASSERT(!is_ro()); + if (blockman_.is_init()) blockman_.deinit_tree(); @@ -5790,7 +6082,7 @@ bvector::bit_xor(const bm::bvector& bv1, bool need_opt = combine_operation_block_xor(i, j, arg_blk1, arg_blk2); if (need_opt && opt_mode == opt_compress) - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_mode); any_blocks |= bool(blk_blk[j]); } while (++j < bm::set_sub_array_size); @@ -5813,6 +6105,8 @@ bvector::bit_and(const bm::bvector& bv1, const bm::bvector& bv2, typename bm::bvector::optmode opt_mode) { + BM_ASSERT(!is_ro()); + if (&bv1 == &bv2) { *this = bv1; @@ -5883,7 +6177,7 @@ bvector::bit_and(const bm::bvector& bv1, bool need_opt = combine_operation_block_and(i, j, arg_blk1, arg_blk2); if (need_opt && opt_mode == opt_compress) - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_mode); any_blocks |= bool(blk_blk[j]); } while (++j < bm::set_sub_array_size); @@ -5906,6 +6200,8 @@ bvector::bit_or_and(const bm::bvector& bv1, const bm::bvector& bv2, typename bm::bvector::optmode opt_mode) { + BM_ASSERT(!is_ro()); + if (&bv1 == &bv2) { this->bit_or(bv1); @@ -5994,7 +6290,7 @@ bvector::bit_or_and(const bm::bvector& bv1, combine_operation_block_and_or(i, j, arg_blk1, arg_blk2); } if (need_opt && opt_mode == opt_compress) - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_mode); any_blocks |= bool(blk_blk[j]); @@ -6021,6 +6317,8 @@ bvector::bit_sub(const bm::bvector& bv1, const bm::bvector& bv2, typename bm::bvector::optmode opt_mode) { + BM_ASSERT(!is_ro()); + if (blockman_.is_init()) blockman_.deinit_tree(); @@ -6086,7 +6384,7 @@ bvector::bit_sub(const bm::bvector& bv1, bool need_opt = combine_operation_block_sub(i, j, arg_blk1, arg_blk2); if (need_opt && opt_mode == opt_compress) - blockman_.optimize_bit_block(i, j); + blockman_.optimize_bit_block(i, j, opt_mode); any_blocks |= bool(blk_blk[j]); } while (++j < bm::set_sub_array_size); @@ -6276,7 +6574,7 @@ void bvector::combine_operation_xor(const bm::bvector& bv) { \ combine_operation_block_and(i, j+x, blk, arg_blk); \ if (opt_mode == opt_compress) \ - blockman_.optimize_bit_block(i, j+x); \ + blockman_.optimize_bit_block(i, j+x, opt_mode); \ } \ else \ blockman_.zero_block(i, j+x); \ @@ -6474,7 +6772,7 @@ void bvector::combine_operation( } bm::word_t*** blk_root = blockman_.top_blocks_root(); - unsigned block_idx = 0; + unsigned block_idx = 0; (void) block_idx; unsigned i, j; // calculate effective top size to avoid overscan @@ -7185,9 +7483,9 @@ void bvector::combine_operation_block_and( bm::bit_block_copy(new_blk, arg_blk); // TODO: copy+digest in one pass bm::id64_t d0 = bm::calc_block_digest0(new_blk); - bm::gap_and_to_bitset(new_blk, gap_blk, d0); + bm::id64_t d0_1 = 
bm::gap_and_to_bitset(new_blk, gap_blk, d0); - bm::id64_t d0_1 = bm::update_block_digest0(new_blk, d0); +// bm::id64_t d0_1 = bm::update_block_digest0(new_blk, d0); BM_ASSERT(bm::word_bitcount64(d0_1) <= bm::word_bitcount64(d0)); if (!d0_1) { @@ -7742,6 +8040,17 @@ void bvector::copy_range_no_check(const bvector& bvect, //--------------------------------------------------------------------- +template +void bvector::freeze() +{ + if (is_ro()) + return; // nothing to do read-only vector already + bvector bv_ro(*this, bm::finalization::READONLY); + swap(bv_ro); +} + +//--------------------------------------------------------------------- + template void bvector::throw_bad_alloc() { diff --git a/tools/tax/src/bm/bmaggregator.h b/tools/tax/src/bm/bmaggregator.h index c689a46b..d7a33eed 100644 --- a/tools/tax/src/bm/bmaggregator.h +++ b/tools/tax/src/bm/bmaggregator.h @@ -1,7 +1,7 @@ #ifndef BMAGGREGATOR__H__INCLUDED__ #define BMAGGREGATOR__H__INCLUDED__ /* -Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) +Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -442,6 +442,13 @@ class aggregator */ bool combine_and_sub(bvector_type& bv_target, bool any); + /** + Aggregate added group of vectors using fused logical AND-SUB. + search traget is back_inserter + */ + template + bool combine_and_sub_bi(BII bi); + /** Aggregate added group of vectors using fused logical AND-SUB, find the first match @@ -468,9 +475,10 @@ class aggregator /** Set search hint for the range, where results needs to be searched (experimental for internal use). + @return true if range is one-block bound @internal */ - void set_range_hint(size_type from, size_type to) BMNOEXCEPT; + bool set_range_hint(size_type from, size_type to) BMNOEXCEPT; /** Reset range hint to false @@ -512,15 +520,21 @@ class aggregator \param src_and_size - size of AND group \param bv_src_sub - array of pointers on bit-vectors for SUBstract \param src_sub_size - size of SUB group - \param any - flag if caller needs any results asap (incomplete results) - + \param any - flag if caller needs any results asap (incomplete results) + \return true when found */ bool combine_and_sub(bvector_type& bv_target, const bvector_type_const_ptr* bv_src_and, size_t src_and_size, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size, bool any); - + + template + bool combine_and_sub(BII bi, + const bvector_type_const_ptr* bv_src_and, size_t src_and_size, + const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size); + + bool find_first_and_sub(size_type& idx, const bvector_type_const_ptr* bv_src_and, size_t src_and_size, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size); @@ -643,11 +657,10 @@ class aggregator const bvector_type_const_ptr* bv_src, size_t src_size); digest_type combine_and_sub(unsigned i, unsigned j, - const size_t* and_idx, const bvector_type_const_ptr* bv_src_and, size_t src_and_size, - const size_t* sub_idx, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size, - int* is_result_full); + int* is_result_full, + bool find_all); void prepare_shift_right_and(bvector_type& bv_target, const bvector_type_const_ptr* bv_src, @@ -701,12 +714,12 @@ class aggregator }; - bm::word_t* sort_input_blocks_or(const size_t* src_idx, + bm::word_t* sort_input_blocks_or(//const size_t* src_idx, const bvector_type_const_ptr* bv_src, size_t src_size, unsigned i, unsigned 
j); - bm::word_t* sort_input_blocks_and(const size_t* src_idx, + bm::word_t* sort_input_blocks_and(//const size_t* src_idx, const bvector_type_const_ptr* bv_src, size_t src_size, unsigned i, unsigned j); @@ -721,7 +734,9 @@ class aggregator void process_gap_blocks_or(const arena& ar/*size_t block_count*/); - digest_type process_bit_blocks_and(const arena& ar, /*size_t block_count,*/ digest_type digest); + digest_type process_bit_blocks_and(const arena& ar, + digest_type digest, + bool find_all); digest_type process_gap_blocks_and(const arena& ar, /*size_t block_count,*/ digest_type digest); @@ -822,6 +837,13 @@ class aggregator bool range_set_ = false; ///< range flag size_type range_from_ = bm::id_max; ///< search from size_type range_to_ = bm::id_max; ///< search to + bm::gap_word_t range_gap_blk_[5] {0,}; ///< temp GAP range block + + // single bit reduction flag + bool is_single_bit_ = false; ///< single bit flag + unsigned single_bit_idx_ = 0; + + typename bvector_type::optmode opt_mode_; ///< perform search result optimization bool compute_count_; ///< compute search result count @@ -941,16 +963,34 @@ void aggregator::reset_range_hint() BMNOEXCEPT { range_set_ = false; range_from_ = range_to_ = bm::id_max; + range_gap_blk_[0] = 0; + is_single_bit_ = false; } // ------------------------------------------------------------------------ template -void aggregator::set_range_hint(size_type from, size_type to) BMNOEXCEPT +bool aggregator::set_range_hint(size_type from, size_type to) BMNOEXCEPT { range_from_ = from; range_to_ = to; range_set_ = true; + typename bvector_type::block_idx_type + nb_from {from >> bm::set_block_shift}, nb_to {to >> bm::set_block_shift}; + if (nb_from == nb_to) + { + gap_init_range_block( + range_gap_blk_, + (gap_word_t)unsigned(from & bm::set_block_mask), + (gap_word_t)unsigned(to & bm::set_block_mask), + (gap_word_t)1); + return true; // one block hit + } + else + { + range_gap_blk_[0] = 0; + } + return false; // range crosses the blocks boundaries } // ------------------------------------------------------------------------ @@ -980,6 +1020,7 @@ size_t aggregator::add(const bvector_type* bv, unsigned agr_group) template void aggregator::combine_or(bvector_type& bv_target) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target combine_or(bv_target, ag_.arg_bv0.data(), ag_.arg_bv0.size()); } @@ -988,6 +1029,7 @@ void aggregator::combine_or(bvector_type& bv_target) template void aggregator::combine_and(bvector_type& bv_target) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target //combine_and(bv_target, ag_.arg_bv0.data(), ag_.arg_bv0.size()); // implemented ad AND-SUB (with an empty MINUS set) combine_and_sub(bv_target, @@ -1001,6 +1043,7 @@ void aggregator::combine_and(bvector_type& bv_target) template bool aggregator::combine_and_sub(bvector_type& bv_target) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target return combine_and_sub(bv_target, ag_.arg_bv0.data(), ag_.arg_bv0.size(), ag_.arg_bv1.data(), ag_.arg_bv1.size(), @@ -1012,12 +1055,24 @@ bool aggregator::combine_and_sub(bvector_type& bv_target) template bool aggregator::combine_and_sub(bvector_type& bv_target, bool any) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target return combine_and_sub(bv_target, ag_.arg_bv0.data(), ag_.arg_bv0.size(), ag_.arg_bv1.data(), ag_.arg_bv1.size(), any); } +// ------------------------------------------------------------------------ + +template template +bool aggregator::combine_and_sub_bi(BII bi) 
+{ + return combine_and_sub(bi, + ag_.arg_bv0.data(), ag_.arg_bv0.size(), + ag_.arg_bv1.data(), ag_.arg_bv1.size()); +} + + // ------------------------------------------------------------------------ template @@ -1033,6 +1088,7 @@ bool aggregator::find_first_and_sub(size_type& idx) template void aggregator::combine_shift_right_and(bvector_type& bv_target) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target count_ = 0; ar_->reset_all_blocks(); combine_shift_right_and(bv_target, ag_.arg_bv0.data(), ag_.arg_bv0.size(),//arg_group0_size, @@ -1045,6 +1101,7 @@ template void aggregator::combine_or(bvector_type& bv_target, const bvector_type_const_ptr* bv_src, size_t src_size) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target if (!src_size) { bv_target.clear(); @@ -1071,6 +1128,7 @@ void aggregator::combine_and(bvector_type& bv_target, const bvector_type_const_ptr* bv_src, size_t src_size) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target if (src_size == 1) { const bvector_type* bv = bv_src[0]; @@ -1106,6 +1164,7 @@ bool aggregator::combine_and_sub(bvector_type& bv_target, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size, bool any) { + BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target bool global_found = false; if (!bv_src_and || !src_and_size) @@ -1131,9 +1190,9 @@ bool aggregator::combine_and_sub(bvector_type& bv_target, { int is_res_full; digest_type digest = combine_and_sub(i, j, - 0, bv_src_and, src_and_size, - 0, bv_src_sub, src_sub_size, - &is_res_full); + /*0,*/ bv_src_and, src_and_size, + /*0,*/ bv_src_sub, src_sub_size, + &is_res_full, !any); if (is_res_full) { bman_target.check_alloc_top_subblock(i); @@ -1160,6 +1219,73 @@ bool aggregator::combine_and_sub(bvector_type& bv_target, return global_found; } +// ------------------------------------------------------------------------ + +template template +bool aggregator::combine_and_sub(BII bi, + const bvector_type_const_ptr* bv_src_and, size_t src_and_size, + const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size) +{ + bool global_found = false; + + if (!bv_src_and || !src_and_size) + return false; + + unsigned top_blocks = 0; + + // pre-scan to calculate top size + for (unsigned i = 0; i < src_and_size; ++i) + { + const bvector_type* bv = bv_src_and[i]; + BM_ASSERT(bv); + unsigned arg_top_blocks = bv->get_blocks_manager().top_block_size(); + if (arg_top_blocks > top_blocks) + top_blocks = arg_top_blocks; + } // for i + for (unsigned i = 0; i < src_sub_size; ++i) + { + const bvector_type* bv = bv_src_sub[i]; + BM_ASSERT(bv); + unsigned arg_top_blocks = bv->get_blocks_manager().top_block_size(); + if (arg_top_blocks > top_blocks) + top_blocks = arg_top_blocks; + } // for i + + bm::bit_visitor_back_inserter_adaptor bit_functor(bi); + for (unsigned i = 0; i < top_blocks; ++i) + { + const unsigned set_array_max = + find_effective_sub_block_size(i, bv_src_and, src_and_size, + bv_src_sub, src_sub_size); + for (unsigned j = 0; j < set_array_max; ++j) + { + int is_res_full; + digest_type digest = combine_and_sub(i, j, + /*0,*/ bv_src_and, src_and_size, + /*0,*/ bv_src_sub, src_sub_size, + &is_res_full, true); + size_type r = size_type(i) * bm::set_sub_array_size; + size_type base_idx = (r+j)*bm::bits_in_block; + if (is_res_full) + { + for (size_type k = 0; k < 65536; ++k) + *bi = base_idx + k; + } + else + { + bool found = digest; + global_found |= found; + if (found) + bm::for_each_bit_blk(tb_ar_->tb1, base_idx, bit_functor); + } + } // for j + } 
// for i + return global_found; + +} + + + // ------------------------------------------------------------------------ template template @@ -1230,10 +1356,7 @@ void aggregator::combine_and_sub(TPipe& pipe) continue; const bvector_type_const_ptr* bv_src_and = ag->arg_bv0.data(); - const size_t* bv_src_and_idx = ag->arg_idx0.data(); - const bvector_type_const_ptr* bv_src_sub = ag->arg_bv1.data(); - const size_t* bv_src_sub_idx = ag->arg_idx1.data(); size_t src_sub_size = ag->arg_bv1.size(); if constexpr (TPipe::options_type::is_compute_counts()) @@ -1245,11 +1368,11 @@ void aggregator::combine_and_sub(TPipe& pipe) int is_res_full; digest_type digest = combine_and_sub(i, j, - bv_src_and_idx, bv_src_and, src_and_size, - bv_src_sub_idx, bv_src_sub, src_sub_size, - &is_res_full); + &is_res_full, + true // find all + ); if (digest || is_res_full) { if (pipe.bv_or_target_) @@ -1336,9 +1459,6 @@ bool aggregator::find_first_and_sub(size_type& idx, const bvector_type_const_ptr* bv_src_and, size_t src_and_size, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size) { - if (!bv_src_and || !src_and_size) - return false; - unsigned top_blocks = max_top_blocks(bv_src_and, src_and_size); unsigned top_blocks2 = max_top_blocks(bv_src_sub, src_sub_size); @@ -1360,9 +1480,10 @@ bool aggregator::find_first_and_sub(size_type& idx, unsigned i = top_from; unsigned j = unsigned(nblock_from & bm::set_array_mask); digest_type digest = combine_and_sub(i, j, - 0, bv_src_and, src_and_size, - 0, bv_src_sub, src_sub_size, - &is_res_full); + bv_src_and, src_and_size, + bv_src_sub, src_sub_size, + &is_res_full, false // first + ); // is_res_full is not needed here, since it is just 1 block if (digest) { @@ -1390,13 +1511,9 @@ bool aggregator::find_first_and_sub(size_type& idx, if (range_set_) { if (i == top_from) - { j = nblock_from & bm::set_array_mask; - } if (i == top_to) - { set_array_max = 1 + unsigned(nblock_to & bm::set_array_mask); - } } else { @@ -1417,9 +1534,9 @@ bool aggregator::find_first_and_sub(size_type& idx, { int is_res_full; digest_type digest = combine_and_sub(i, j, - 0, bv_src_and, src_and_size, - 0, bv_src_sub, src_sub_size, - &is_res_full); + /*0,*/ bv_src_and, src_and_size, + /*0,*/ bv_src_sub, src_sub_size, + &is_res_full, false); if (digest) { unsigned block_bit_idx = 0; @@ -1429,7 +1546,6 @@ bool aggregator::find_first_and_sub(size_type& idx, return found; } } // for j - //while (++j < set_array_max); } // for i return false; } @@ -1516,7 +1632,7 @@ void aggregator::combine_or(unsigned i, unsigned j, bv_target.get_blocks_manager(); ar_->reset_or_blocks(); - bm::word_t* blk = sort_input_blocks_or(0, bv_src, src_size, i, j); + bm::word_t* blk = sort_input_blocks_or(/*0,*/ bv_src, src_size, i, j); BM_ASSERT(blk == 0 || blk == FULL_BLOCK_FAKE_ADDR); @@ -1556,7 +1672,7 @@ void aggregator::combine_and(unsigned i, unsigned j, { BM_ASSERT(src_and_size); - bm::word_t* blk = sort_input_blocks_and(0, bv_src, src_and_size, i, j); + bm::word_t* blk = sort_input_blocks_and(/*0,*/ bv_src, src_and_size, i, j); BM_ASSERT(blk == 0 || blk == FULL_BLOCK_FAKE_ADDR); if (!blk) // nothing to do - golden block(!) 
return; @@ -1580,17 +1696,14 @@ void aggregator::combine_and(unsigned i, unsigned j, } // AND bit-blocks // - bm::id64_t digest = ~0ull; - digest = process_bit_blocks_and(*ar_, digest); + bm::id64_t digest = process_bit_blocks_and(*ar_, ~0ull, true); if (!digest) return; // AND all GAP blocks (if any) // if (arg_blk_and_gap_count) - { digest = process_gap_blocks_and(*ar_, digest); - } if (digest) // we have results , allocate block and copy from temp { blocks_manager_type& bman_target = bv_target.get_blocks_manager(); @@ -1606,17 +1719,15 @@ template typename aggregator::digest_type aggregator::combine_and_sub( unsigned i, unsigned j, - const size_t* and_idx, const bvector_type_const_ptr* bv_src_and, size_t src_and_size, - const size_t* sub_idx, const bvector_type_const_ptr* bv_src_sub, size_t src_sub_size, - int* is_result_full) + int* is_result_full, bool find_all) { - BM_ASSERT(src_and_size); BM_ASSERT(is_result_full); + is_single_bit_ = false; *is_result_full = 0; - bm::word_t* blk = sort_input_blocks_and(and_idx, bv_src_and, src_and_size, i, j); + bm::word_t* blk = sort_input_blocks_and(/*and_idx,*/ bv_src_and, src_and_size, i, j); BM_ASSERT(blk == 0 || blk == FULL_BLOCK_FAKE_ADDR); if (!blk) return 0; // nothing to do - golden block(!) @@ -1630,7 +1741,7 @@ aggregator::combine_and_sub( ar_->reset_or_blocks(); if (src_sub_size) { - blk = sort_input_blocks_or(sub_idx, bv_src_sub, src_sub_size, i, j); + blk = sort_input_blocks_or(/*sub_idx,*/ bv_src_sub, src_sub_size, i, j); BM_ASSERT(blk == 0 || blk == FULL_BLOCK_FAKE_ADDR); if (blk == FULL_BLOCK_FAKE_ADDR) return 0; // nothing to do - golden block(!) @@ -1648,32 +1759,53 @@ aggregator::combine_and_sub( } } - digest_type digest = ~0ull; - // AND-SUB bit-blocks // - digest = process_bit_blocks_and(*ar_, digest); + digest_type digest = process_bit_blocks_and(*ar_, ~0ull, find_all); if (!digest) return digest; digest = process_bit_blocks_sub(*ar_, digest); - if (!digest) + + // if just 1 bit left after bit-blocks processing we can + // use short variant of GAP blocks AND-SUB + // + switch(bm::word_bitcount64(digest)) + { + case 0: return digest; - + case 1: + if (is_single_bit_) + { + size_t arg_blk_gap_count = ar_->v_arg_and_blk_gap.size(); + for (size_t k = 0; k < arg_blk_gap_count; ++k) + if (!bm::gap_test_unr(ar_->v_arg_and_blk_gap[k], single_bit_idx_)) + return 0; // AND 0 causes result to turn 0 + arg_blk_gap_count = ar_->v_arg_or_blk_gap.size(); + for (size_t k = 0; k < arg_blk_gap_count; ++k) + if (bm::gap_test_unr(ar_->v_arg_or_blk_gap[k], single_bit_idx_)) + return 0; // AND-NOT causes search result to turn 0 + return digest; + } + break; + default: break; + } // switch + // AND all GAP block // digest = process_gap_blocks_and(*ar_, digest); if (!digest) return digest; - digest = process_gap_blocks_sub(*ar_, digest); + is_single_bit_ = false; + return digest; } // ------------------------------------------------------------------------ template -void aggregator::process_gap_blocks_or(const arena& ar)//size_t arg_blk_gap_count) +void aggregator::process_gap_blocks_or(const arena& ar) { size_t arg_blk_gap_count = ar.v_arg_or_blk_gap.size(); bm::word_t* blk = tb_ar_->tb1; @@ -1689,33 +1821,29 @@ aggregator::process_gap_blocks_and(const arena& ar, digest_type digest) { bm::word_t* blk = tb_ar_->tb1; - size_t arg_blk_gap_count = ar.v_arg_and_blk_gap.size(); - bool single_bit_found; - unsigned single_bit_idx; - for (size_t k = 0; k < arg_blk_gap_count; ++k) + const size_t arg_blk_gap_count = ar.v_arg_and_blk_gap.size(); + + for 
(size_t k = 0; (k < arg_blk_gap_count) && digest; ++k) { - bm::gap_and_to_bitset(blk, ar.v_arg_and_blk_gap[k], digest); - digest = bm::update_block_digest0(blk, digest); - if (!digest) - { - BM_ASSERT(bm::bit_is_all_zero(blk)); - break; - } - if (bm::word_bitcount64(digest) == 1) + digest = bm::gap_and_to_bitset(blk, ar.v_arg_and_blk_gap[k], digest); + switch(bm::word_bitcount64(digest)) { - single_bit_found = bm::bit_find_first_if_1(blk, &single_bit_idx, digest); - if (single_bit_found) + case 0: + return digest; + case 1: + is_single_bit_ = bm::bit_find_first_if_1(blk, &single_bit_idx_, digest); + if (is_single_bit_) { for (++k; k < arg_blk_gap_count; ++k) - { - bool b = bm::gap_test_unr(ar.v_arg_and_blk_gap[k], single_bit_idx); - if (!b) + if (!bm::gap_test_unr(ar.v_arg_and_blk_gap[k], single_bit_idx_)) return 0; // AND 0 causes result to turn 0 - } // for k - break; + return digest; } + break; + default: break; } - } + } // for k + BM_ASSERT(digest || bm::bit_is_all_zero(blk)); return digest; } @@ -1726,35 +1854,38 @@ typename aggregator::digest_type aggregator::process_gap_blocks_sub(const arena& ar, digest_type digest) { - size_t arg_blk_gap_count = ar.v_arg_or_blk_gap.size(); + const size_t arg_blk_gap_count = ar.v_arg_or_blk_gap.size(); bm::word_t* blk = tb_ar_->tb1; - bool single_bit_found; - unsigned single_bit_idx; - for (size_t k = 0; k < arg_blk_gap_count; ++k) + + if (is_single_bit_) { - bm::gap_sub_to_bitset(blk, ar.v_arg_or_blk_gap[k], digest); - digest = bm::update_block_digest0(blk, digest); - if (!digest) - { - BM_ASSERT(bm::bit_is_all_zero(blk)); - break; - } - // check if logical operation reduced to a corner case of one single bit - if (bm::word_bitcount64(digest) == 1) + for (size_t k = 0; k < arg_blk_gap_count; ++k) + if (bm::gap_test_unr(ar.v_arg_or_blk_gap[k], single_bit_idx_)) + return 0; // AND-NOT causes search result to turn 0 + return digest; + } + + for (size_t k = 0; digest && (k < arg_blk_gap_count); ++k) + { + digest = bm::gap_sub_to_bitset(blk, ar.v_arg_or_blk_gap[k], digest); + switch(bm::word_bitcount64(digest)) { - single_bit_found = bm::bit_find_first_if_1(blk, &single_bit_idx, digest); - if (single_bit_found) + case 0: + return digest; + case 1: + is_single_bit_ = bm::bit_find_first_if_1(blk, &single_bit_idx_, digest); + if (is_single_bit_) { for (++k; k < arg_blk_gap_count; ++k) - { - bool b = bm::gap_test_unr(ar.v_arg_or_blk_gap[k], single_bit_idx); - if (b) + if (bm::gap_test_unr(ar.v_arg_or_blk_gap[k], single_bit_idx_)) return 0; // AND-NOT causes search result to turn 0 - } // for k - break; + return digest; } + break; + default: break; } } // for k + BM_ASSERT(digest || bm::bit_is_all_zero(blk)); return digest; } @@ -1861,14 +1992,15 @@ bool aggregator::process_bit_blocks_or(blocks_manager_type& bman_target, template typename aggregator::digest_type aggregator::process_bit_blocks_and(const arena& ar, - digest_type digest) + digest_type digest, + bool find_all) { bm::word_t* blk = tb_ar_->tb1; - size_t k = 0; size_t arg_blk_count = ar.v_arg_and_blk.size(); - const word_t** args = ar.v_arg_and_blk.data(); + size_t k = 0; + block_idx_type nb_from = (range_from_ >> bm::set_block_shift); block_idx_type nb_to = (range_to_ >> bm::set_block_shift); if (range_set_ && (nb_from == nb_to)) @@ -1877,7 +2009,22 @@ aggregator::process_bit_blocks_and(const arena& ar, unsigned nbit_to = unsigned(range_to_ & bm::set_block_mask); digest_type digest0 = bm::digest_mask(nbit_from, nbit_to); digest &= digest0; - bm::block_init_digest0(blk, digest); + + if 
(arg_blk_count > 1) // 2 or more + { + if (find_all) + digest = bm::bit_block_init_and_2way(blk, + args[k], args[k+1], + digest); + else + digest = bm::bit_block_and_2way(blk, + args[k], args[k+1], digest); + k += 2; + } + else + { + bm::block_init_digest0(blk, digest); + } } else { @@ -1896,48 +2043,77 @@ aggregator::process_bit_blocks_and(const arena& ar, } // switch } - size_t unroll_factor, len, len_unr; - unsigned single_bit_idx; - - unroll_factor = 4; - len = arg_blk_count - k; - len_unr = len - (len % unroll_factor); - for (; k < len_unr; k += unroll_factor) - { - digest = - bm::bit_block_and_5way(blk, - args[k], args[k + 1], - args[k + 2], args[k + 3], - digest); - if (!digest) // all zero - return digest; - if (bm::word_bitcount64(digest) == 1) + const size_t unroll_factor = 4; + for (; k + unroll_factor < arg_blk_count; k += unroll_factor) + { + digest = bm::bit_block_and_5way(blk, + args[k], args[k+1], args[k+2], args[k+3], + digest); + switch (bm::word_bitcount64(digest)) { - bool found = bm::bit_find_first_if_1(blk, &single_bit_idx, digest); - if (found) + case 0: return 0; + case 1: + is_single_bit_ = bm::bit_find_first_if_1(blk, &single_bit_idx_, digest); + if (is_single_bit_) { - unsigned nword = unsigned(single_bit_idx >> bm::set_word_shift); - unsigned mask = 1u << (single_bit_idx & bm::set_word_mask); - for (++k; k < arg_blk_count; ++k) + k += unroll_factor; + sbit_check: + const unsigned nword = unsigned(single_bit_idx_ >> bm::set_word_shift); + const unsigned mask = 1u << (single_bit_idx_ & bm::set_word_mask); + for (; k + unroll_factor < arg_blk_count; k += unroll_factor) { - if (!(mask & args[k][nword])) - { - blk[nword] = 0; + bm::word_t acc = mask & args[k][nword] & args[k+1][nword] & + args[k+2][nword] & args[k+3][nword]; + if (!acc) return 0; - } } // for k - break; + for (; k + 2 < arg_blk_count; k += 2) + { + bm::word_t acc = mask & args[k][nword] & args[k+1][nword]; + if (!acc) + return 0; + } // for k + + bm::word_t acc = mask; + for (; k < arg_blk_count; ++k) + acc &= args[k][nword]; + if (!(mask & acc)) + return 0; + return digest; } - } + break; + default: break; + } // switch } // for k + for (; k + 2 < arg_blk_count; k += 2) + { + digest = bm::bit_block_and_3way(blk, args[k], args[k+1], digest); + switch(bm::word_bitcount64(digest)) + { + case 0: return digest; + case 1: + is_single_bit_ = bm::bit_find_first_if_1(blk, + &single_bit_idx_, digest); + if (is_single_bit_) { ++k; goto sbit_check; } + break; + default: break; + } // switch + } for (; k < arg_blk_count; ++k) - {/* - if (ar_->v_arg_and_blk[k] == FULL_BLOCK_REAL_ADDR) - continue;*/ + { digest = bm::bit_block_and(blk, args[k], digest); - if (!digest) // all zero - return digest; + switch(bm::word_bitcount64(digest)) + { + case 0: return digest; + case 1: + is_single_bit_ = bm::bit_find_first_if_1(blk, + &single_bit_idx_, digest); + if (is_single_bit_) + { ++k; goto sbit_check; } + break; + default: break; + } // switch } // for k return digest; } @@ -1949,42 +2125,82 @@ typename aggregator::digest_type aggregator::process_bit_blocks_sub(const arena& ar, digest_type digest) { - size_t arg_blk_count = ar.v_arg_or_blk.size(); + size_t arg_blk_count = ar.v_arg_or_blk.size(); bm::word_t* blk = tb_ar_->tb1; - unsigned single_bit_idx; const word_t** args = ar.v_arg_or_blk.data(); + const size_t unroll_factor = 4; size_t k = 0; - for (; k < arg_blk_count; ++k) + + if (is_single_bit_) + goto sbit_check; + + for (; k + unroll_factor < arg_blk_count; k += unroll_factor) { - /* - if (ar.v_arg_or_blk[k] == 
@@ -1949,42 +2125,82 @@ typename aggregator<BV>::digest_type
 aggregator<BV>::process_bit_blocks_sub(const arena& ar,
                                        digest_type digest)
 {
-    size_t arg_blk_count = ar.v_arg_or_blk.size();
+    size_t arg_blk_count = ar.v_arg_or_blk.size();
     bm::word_t* blk = tb_ar_->tb1;
-    unsigned single_bit_idx;
     const word_t** args = ar.v_arg_or_blk.data();
+    const size_t unroll_factor = 4;
     size_t k = 0;
-    for (; k < arg_blk_count; ++k)
+
+    if (is_single_bit_)
+        goto sbit_check;
+
+    for (; k + unroll_factor < arg_blk_count; k += unroll_factor)
     {
-    /*
-    if (ar.v_arg_or_blk[k] == FULL_BLOCK_REAL_ADDR) // golden block
-    {
-        digest = 0;
-        break;
-    } */
-        digest = bm::bit_block_sub(blk, args[k], digest);
-        if (!digest) // all zero
-            break;
-        if (bm::word_bitcount64(digest) == 1)
+        digest = bm::bit_block_sub_5way(blk,
+                                        args[k], args[k+1], args[k+2], args[k+3],
+                                        digest);
+        switch(bm::word_bitcount64(digest))
         {
-            bool found = bm::bit_find_first_if_1(blk, &single_bit_idx, digest);
-            if (found)
+        case 0:
+            return digest;
+        case 1:
+            is_single_bit_ = bm::bit_find_first_if_1(blk,
+                                                     &single_bit_idx_, digest);
+            if (is_single_bit_)
             {
-                const unsigned mask = 1u << (single_bit_idx & bm::set_word_mask);
-                unsigned nword = unsigned(single_bit_idx >> bm::set_word_shift);
-                for (++k; k < arg_blk_count; ++k)
+                k += unroll_factor;
+            sbit_check:
+                const unsigned mask =
+                    1u << (single_bit_idx_ & bm::set_word_mask);
+                const unsigned nword =
+                    unsigned(single_bit_idx_ >> bm::set_word_shift);
+                bm::word_t acc = 0;
+                for (; k + unroll_factor < arg_blk_count; k += unroll_factor)
                 {
-                    if (mask & args[k][nword])
-                    {
-                        blk[nword] = 0;
+                    acc = args[k][nword] | args[k+1][nword] |
+                          args[k+2][nword] | args[k+3][nword];
+                    if (mask & acc)
                         return 0;
-                    }
                 } // for k
+                for (; k < arg_blk_count; ++k)
+                    acc |= args[k][nword];
+                if (mask & acc)
+                    return 0;
                 return digest;
             }
-        }
+            break;
+        default: break;
+        } // switch
     } // for k
 
+    for (; k + 2 < arg_blk_count; k += 2)
+    {
+        digest = bm::bit_block_sub_3way(blk, args[k], args[k+1], digest);
+        switch(bm::word_bitcount64(digest))
+        {
+        case 0: return digest;
+        case 1:
+            is_single_bit_ = bm::bit_find_first_if_1(blk,
+                                                     &single_bit_idx_, digest);
+            if (is_single_bit_) { ++k; goto sbit_check; }
+            break;
+        default: break;
+        } // switch
+    }
+    for (; k < arg_blk_count; ++k)
+    {
+        digest = bm::bit_block_sub(blk, args[k], digest);
+        switch(bm::word_bitcount64(digest))
+        {
+        case 0: return digest;
+        case 1:
+            is_single_bit_ = bm::bit_find_first_if_1(blk, &single_bit_idx_, digest);
+            if (is_single_bit_)
+                { ++k; goto sbit_check; }
+            break;
+        default: break;
+        } // switch
    } // for
    return digest;
}

@@ -2042,17 +2258,16 @@ aggregator<BV>::max_top_blocks(const bvector_type_const_ptr* bv_src,
                                size_t src_size) BMNOEXCEPT
 {
     unsigned top_blocks = 1;
-
-    // pre-scan to do target size harmonization
-    for (unsigned i = 0; i < src_size; ++i)
+    for (unsigned i = 0; i < src_size; ++i) // pre-scan: target size sync
     {
-        const bvector_type* bv = bv_src[i];
-        if (!bv)
-            continue;
-        const typename bvector_type::blocks_manager_type& bman_arg = bv->get_blocks_manager();
-        unsigned arg_top_blocks = bman_arg.top_block_size();
-        if (arg_top_blocks > top_blocks)
-            top_blocks = arg_top_blocks;
+        if (const bvector_type* bv = bv_src[i])
+        {
+            const typename bvector_type::blocks_manager_type& bman_arg =
+                                                    bv->get_blocks_manager();
+            unsigned arg_top_blocks = bman_arg.top_block_size();
+            if (arg_top_blocks > top_blocks)
+                top_blocks = arg_top_blocks;
+        }
     } // for i
     return top_blocks;
 }

 // ------------------------------------------------------------------------

 template<typename BV>
 bm::word_t* aggregator<BV>::sort_input_blocks_or(
-                                        const size_t* src_idx,
                                         const bvector_type_const_ptr* bv_src,
                                         size_t src_size,
                                         unsigned i, unsigned j)
 {
-    bm::word_t* blk = 0;
-    ar_->v_arg_or_blk.reset(); ar_->v_arg_or_blk_gap.reset();
+    auto bit_arr = ar_->v_arg_or_blk.resize_no_copy(src_size);
+    auto gap_arr = ar_->v_arg_or_blk_gap.resize_no_copy(src_size);
+
+    size_t bc(0), gc(0);
+
     for (size_t k = 0; k < src_size; ++k)
     {
         const bm::word_t* arg_blk =
             bv_src[k]->get_blocks_manager().get_block_ptr(i, j);
         if (BM_IS_GAP(arg_blk))
         {
-            (void)src_idx;
-#if (0)
-            if (bcache_ptr_)
-            {
-                BM_ASSERT(bv == bcache_ptr_->bv_inp_vect_[src_idx[k]]);
-                bm::word_t* bit_blk = cache_gap_block(arg_blk, src_idx, k, i, j);
-                if (bit_blk)
-                {
-                    ar_->v_arg_or_blk.push_back(bit_blk); // use cached bit-block for operation
-                    continue;
-                }
-            } // bcache_ptr_
-#endif
-            ar_->v_arg_or_blk_gap.push_back(BMGAP_PTR(arg_blk));
+            gap_arr[gc++] = BMGAP_PTR(arg_blk);
         }
         else // FULL or bit block
         {
@@ -2095,22 +2299,26 @@ bm::word_t* aggregator<BV>::sort_input_blocks_or(
                 continue;
             if (arg_blk == FULL_BLOCK_FAKE_ADDR)
                 return FULL_BLOCK_FAKE_ADDR;
-            ar_->v_arg_or_blk.push_back(arg_blk);
+            bit_arr[bc++] = arg_blk;
         }
     } // for k
-    return blk;
+
+    ar_->v_arg_or_blk.resize_no_check(bc);
+    ar_->v_arg_or_blk_gap.resize_no_check(gc);
+
+    return 0;
 }

 // ------------------------------------------------------------------------

 template<typename BV>
 bm::word_t* aggregator<BV>::sort_input_blocks_and(
-                                        const size_t* src_idx,
                                         const bvector_type_const_ptr* bv_src,
                                         size_t src_size,
                                         unsigned i, unsigned j)
 {
     ar_->v_arg_tmp_blk.resize_no_copy(src_size);
+    auto blocks_arr = ar_->v_arg_tmp_blk.data();
     for (size_t k = 0; k < src_size; ++k)
     {
@@ -2121,39 +2329,16 @@ bm::word_t* aggregator<BV>::sort_input_blocks_and(
     }

     bool has_full_blk = false;
-    bm::word_t* blk = FULL_BLOCK_FAKE_ADDR;
-    auto& bit_v = ar_->v_arg_and_blk;
-    auto& gap_v = ar_->v_arg_and_blk_gap;
-    bit_v.resize_no_copy(src_size);
-    gap_v.resize_no_copy(src_size);
+    auto bit_arr = ar_->v_arg_and_blk.resize_no_copy(src_size + 1);
+    auto gap_arr = ar_->v_arg_and_blk_gap.resize_no_copy(src_size + 1);
     size_t bc(0), gc(0);
-    auto bit_arr = bit_v.data();
-    auto gap_arr = gap_v.data();
     for (size_t k = 0; k < src_size; ++k)
     {
         const bm::word_t* arg_blk = blocks_arr[k];
         if (BM_IS_GAP(arg_blk))
         {
             const bm::gap_word_t* gap_blk = BMGAP_PTR(arg_blk);
-            (void)src_idx;
-#if (0)
-            if (bcache_ptr_)
-            {
-                unsigned len = bm::gap_length(gap_blk);
-                size_t bv_idx = src_idx[k];
-                auto cnt = bcache_ptr_->cnt_vect_[bv_idx];
-                if (cnt && len > 1024 && bc < (src_size / 4))
-                {
-                    bm::word_t* bit_blk = cache_gap_block(arg_blk, src_idx, k, i, j);
-                    if (bit_blk)
-                    {
-                        bit_arr[bc++] = bit_blk; // use cached bit-block for operation
-                        continue;
-                    }
-                }
-            } // bcache_ptr_
-#endif
             gap_arr[gc++] = gap_blk;
             continue;
         }
@@ -2164,16 +2349,20 @@ bm::word_t* aggregator<BV>::sort_input_blocks_and(
             continue;
         }
         bit_arr[bc++] = arg_blk;
     } // for k
-    bit_v.resize_no_copy(bc);
-    gap_v.resize_no_copy(gc);
+
+    if (range_gap_blk_[0]) // block specific AND filter exists
+    {
+        BM_ASSERT(range_set_);
+        gap_arr[gc++] = range_gap_blk_;
+    }
+    ar_->v_arg_and_blk_gap.resize_no_check(gc);

     if (has_full_blk && (!bc && !gc))
-        bit_v.push_back(FULL_BLOCK_REAL_ADDR);
+        bit_arr[bc++] = FULL_BLOCK_REAL_ADDR;
+    ar_->v_arg_and_blk.resize_no_check(bc);

-    return blk;
+    return FULL_BLOCK_FAKE_ADDR;
 }

 // ------------------------------------------------------------------------
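// Sketch of the block-classification step performed by the two sort_input_*
// routines above (names simplified; assumes BitMagic's internal tagged-pointer
// macros are in scope). Arguments are routed into homogeneous GAP and
// bit-block lists so the kernels can run tight loops; for AND, a FULL block
// is the identity and is only kept when it is the sole survivor.
enum class blk_kind { null_b, full_b, gap_b, bit_b };

inline blk_kind classify_block(const bm::word_t* blk)
{
    if (!blk)
        return blk_kind::null_b;
    if (blk == FULL_BLOCK_FAKE_ADDR)   // all-ones block, no real memory
        return blk_kind::full_b;
    if (BM_IS_GAP(blk))                // tagged pointer: GAP-compressed runs
        return blk_kind::gap_b;
    return blk_kind::bit_b;            // plain uncompressed bit block
}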
@@ -2218,14 +2407,16 @@ template<typename BV>
 void aggregator<BV>::combine_or_horizontal(bvector_type& bv_target,
                                            const bvector_type_const_ptr* bv_src,
                                            size_t src_size)
 {
+    BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target
     BM_ASSERT(src_size);
+
     if (src_size == 0)
     {
         bv_target.clear();
         return;
     }
     const bvector_type* bv = bv_src[0];
-    bv_target = *bv;
+    bv_target.copy(*bv, bm::finalization::READWRITE);

     for (unsigned i = 1; i < src_size; ++i)
     {
         bv = bv_src[i];
@@ -2240,6 +2431,7 @@ template<typename BV>
 void aggregator<BV>::combine_and_horizontal(bvector_type& bv_target,
                                             const bvector_type_const_ptr* bv_src,
                                             size_t src_size)
 {
+    BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target
     BM_ASSERT(src_size);

     if (src_size == 0)
@@ -2248,7 +2440,7 @@ void aggregator<BV>::combine_and_horizontal(bvector_type& bv_target,
         return;
     }
     const bvector_type* bv = bv_src[0];
-    bv_target = *bv;
+    bv_target.copy(*bv, bm::finalization::READWRITE);

     for (unsigned i = 1; i < src_size; ++i)
     {
@@ -2268,6 +2460,7 @@ void aggregator<BV>::combine_and_sub_horizontal(bvector_type& bv_target,
                                                 size_t src_sub_size)
 {
     BM_ASSERT(src_and_size);
+    BM_ASSERT(!bv_target.is_ro()); // immutable vector used as a target

     combine_and_horizontal(bv_target, bv_src_and, src_and_size);

@@ -2440,8 +2633,8 @@ unsigned aggregator<BV>::process_shift_right_and(
         }
         BM_ASSERT(bm::calc_block_digest0(blk) == digest);

-        bm::gap_and_to_bitset(blk, BMGAP_PTR(arg_blk), digest);
-        digest = bm::update_block_digest0(blk, digest);
+        digest = bm::gap_and_to_bitset(blk, BMGAP_PTR(arg_blk), digest);
+        //digest = bm::update_block_digest0(blk, digest);
     }
     else // 2 bit-blocks
     {
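// Usage sketch for the horizontal combine path, assuming the public
// aggregator API (add()/combine_or()); the fold is left-to-right:
// target = *src[0]; target |= *src[i] for each following argument.
#include "bm.h"
#include "bmaggregator.h"

void or_fold_example()
{
    bm::bvector<> bv1, bv2, bv_target;
    bv1.set(10);
    bv2.set(20);

    bm::aggregator<bm::bvector<> > agg;
    agg.add(&bv1);
    agg.add(&bv2);
    agg.combine_or(bv_target);   // bv_target = bv1 | bv2
}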
diff --git a/tools/tax/src/bm/bmalgo.h b/tools/tax/src/bm/bmalgo.h
index d10b72f7..c0d7b9ff 100644
--- a/tools/tax/src/bm/bmalgo.h
+++ b/tools/tax/src/bm/bmalgo.h
@@ -281,43 +281,7 @@ int for_each_bit_range(const BV& bv,

 #undef BM_SCANNER_OP

-/// functor-adaptor for C-style callbacks
-///
-/// @internal
-///
-template<typename SZT, typename VCBT>
-struct bit_visitor_callback_adaptor
-{
-    typedef VCBT bit_visitor_callback_type;
-
-    bit_visitor_callback_adaptor(void* h, bit_visitor_callback_type cb_func)
-        : handle_(h), func_(cb_func)
-    {}
-    int add_bits(size_type offset, const unsigned char* bits, unsigned size)
-    {
-        for (unsigned i = 0; i < size; ++i)
-        {
-            int ret = func_(handle_, offset + bits[i]);
-            if (ret < 0)
-                return ret;
-        }
-        return 0;
-    }
-    int add_range(size_type offset, size_type size)
-    {
-        for (size_type i = 0; i < size; ++i)
-        {
-            int ret = func_(handle_, offset + i);
-            if (ret < 0)
-                return ret;
-        }
-        return 0;
-    }
-
-    void* handle_;
-    bit_visitor_callback_type func_;
-};

 /// Functor for bit-copy (for testing)

diff --git a/tools/tax/src/bm/bmalgo_impl.h b/tools/tax/src/bm/bmalgo_impl.h
index e8b5dd5c..75428383 100644
--- a/tools/tax/src/bm/bmalgo_impl.h
+++ b/tools/tax/src/bm/bmalgo_impl.h
@@ -83,7 +83,6 @@ distance_metric operation2metric(set_operation op) BMNOEXCEPT
     \brief Distance metric descriptor, holds metric code and result.
     \sa distance_operation
  */
-
 struct distance_metric_descriptor
 {
 #ifdef BM64ADDR
@@ -113,6 +112,71 @@ struct distance_metric_descriptor
     }
 };

+/// functor-adaptor for C-style callbacks
+///
+/// @internal
+///
+template<typename SZT, typename VCBT>
+struct bit_visitor_callback_adaptor
+{
+    typedef VCBT bit_visitor_callback_type;
+
+    bit_visitor_callback_adaptor(void* h, bit_visitor_callback_type cb_func)
+        : handle_(h), func_(cb_func)
+    {}
+
+    int add_bits(size_type offset, const unsigned char* bits, unsigned size)
+    {
+        for (unsigned i = 0; i < size; ++i)
+        {
+            int ret = func_(handle_, offset + bits[i]);
+            if (ret < 0)
+                return ret;
+        }
+        return 0;
+    }
+    int add_range(size_type offset, size_type size)
+    {
+        for (size_type i = 0; i < size; ++i)
+        {
+            int ret = func_(handle_, offset + i);
+            if (ret < 0)
+                return ret;
+        }
+        return 0;
+    }
+
+    void* handle_;
+    bit_visitor_callback_type func_;
+};
+
+/// functor-adaptor for back-inserter
+///
+/// @internal
+///
+template<typename BII, typename SZT>
+struct bit_visitor_back_inserter_adaptor
+{
+
+    bit_visitor_back_inserter_adaptor(BII bi)
+        : bi_(bi)
+    {}
+
+    int add_bits(size_type offset, const unsigned char* bits, unsigned size)
+    {
+        for (unsigned i = 0; i < size; ++i)
+            *bi_ = offset + bits[i];
+        return 0;
+    }
+    int add_range(size_type offset, size_type size)
+    {
+        for (size_type i = 0; i < size; ++i)
+            *bi_ = offset + i;
+        return 0;
+    }
+
+    BII bi_;
+};

/*!
diff --git a/tools/tax/src/bm/bmalloc.h b/tools/tax/src/bm/bmalloc.h
index a4478557..81a04340 100644
--- a/tools/tax/src/bm/bmalloc.h
+++ b/tools/tax/src/bm/bmalloc.h
@@ -115,7 +115,16 @@ class ptr_allocator
     */
     static void* allocate(size_t n, const void *)
     {
-        void* ptr = ::malloc(n * sizeof(void*));
+        void* ptr;
+#if defined(BM_ALLOC_ALIGN)
+    #ifdef _MSC_VER
+        ptr = (bm::word_t*) ::_aligned_malloc(n * sizeof(void*), BM_ALLOC_ALIGN);
+    #else
+        ptr = (bm::word_t*) ::_mm_malloc(n * sizeof(void*), BM_ALLOC_ALIGN);
+    #endif
+#else
+        ptr = (bm::word_t*) ::malloc(n * sizeof(void*));
+#endif
         if (!ptr)
             throw std::bad_alloc();
         return ptr;
@@ -127,7 +136,15 @@ class ptr_allocator
     */
     static void deallocate(void* p, size_t) BMNOEXCEPT
     {
+#ifdef BM_ALLOC_ALIGN
+    #ifdef _MSC_VER
+        ::_aligned_free(p);
+    #else
+        ::_mm_free(p);
+    #endif
+#else
         ::free(p);
+#endif
     }
 };

@@ -175,6 +192,11 @@ class pointer_pool_array
             return 0;
         return pool_ptr_[--size_];
     }
+
+    /// return stack size
+    ///
+    unsigned size() const BMNOEXCEPT { return size_; }
+
 private:
     void allocate_pool(size_t pool_size)
     {
@@ -190,7 +212,7 @@ class pointer_pool_array
     }
 private:
     void**    pool_ptr_; ///< array of pointers in the pool
-    unsigned  size_;     ///< current size
+    unsigned  size_;     ///< current size
 };

 /**
@@ -206,10 +228,10 @@ class alloc_pool
 public:
     alloc_pool() {}
-    ~alloc_pool()
-    {
-        free_pools();
-    }
+    ~alloc_pool() { free_pools(); }
+
+    void set_block_limit(size_t limit) BMNOEXCEPT
+        { block_limit_ = limit; }

     bm::word_t* alloc_bit_block()
     {
@@ -222,6 +244,14 @@ class alloc_pool
     void free_bit_block(bm::word_t* block) BMNOEXCEPT
     {
         BM_ASSERT(IS_VALID_ADDR(block));
+        if (block_limit_) // soft limit set
+        {
+            if (block_pool_.size() >= block_limit_)
+            {
+                block_alloc_.deallocate(block, bm::set_block_size);
+                return;
+            }
+        }
         if (!block_pool_.push(block))
             block_alloc_.deallocate(block, bm::set_block_size);
     }
@@ -237,9 +267,14 @@ class alloc_pool
         } while (block);
     }

+    /// return stack size
+    ///
+    unsigned size() const BMNOEXCEPT { return block_pool_.size(); }
+
 protected:
     pointer_pool_array  block_pool_;
     BA                  block_alloc_;
+    size_t              block_limit_ = 0; ///< soft limit for the pool of blocks
 };

@@ -322,7 +357,7 @@ class mem_alloc
     /*! @brief Frees bit block allocated by alloc_bit_block.
     */
-    void free_bit_block(bm::word_t* block, unsigned alloc_factor = 1) BMNOEXCEPT
+    void free_bit_block(bm::word_t* block, size_t alloc_factor = 1) BMNOEXCEPT
     {
         BM_ASSERT(IS_VALID_ADDR(block));
         if (alloc_pool_p_ && alloc_factor == 1)
@@ -443,7 +478,6 @@ void aligned_free(void* ptr) BMNOEXCEPT

-#undef BM_ALLOC_ALIGN

 } // namespace bm
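// Usage sketch for the new soft pool limit, assuming a pool shared by
// several vectors: freed bit-blocks are cached for reuse, but once the
// internal stack reaches the limit, further blocks go straight to the heap.
#include "bm.h"

void pool_limit_example()
{
    typedef bm::bvector<>::allocator_pool_type pool_type;
    pool_type pool;
    pool.set_block_limit(64);  // cache at most 64 free bit-blocks

    bm::bvector<> bv;
    bm::bvector<>::mem_pool_guard guard(pool, bv); // unbinds pool at scope exit
    for (unsigned i = 0; i < 100000; i += 3)
        bv.set(i);
}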
diff --git a/tools/tax/src/bm/bmavx2.h b/tools/tax/src/bm/bmavx2.h
index 2f468d13..3ffc1b55 100644
--- a/tools/tax/src/bm/bmavx2.h
+++ b/tools/tax/src/bm/bmavx2.h
@@ -1,7 +1,7 @@
 #ifndef BMAVX2__H__INCLUDED__
 #define BMAVX2__H__INCLUDED__
 /*
-Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
+Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -719,6 +719,54 @@ bool avx2_and_digest_5way(__m256i* BMRESTRICT dst,
     return _mm256_testz_si256(m1A, m1A);
 }

+/*!
+    @brief AND block digest stride
+    @ingroup AVX2
+*/
+inline
+bool avx2_and_digest_3way(__m256i* BMRESTRICT dst,
+                          const __m256i* BMRESTRICT src1,
+                          const __m256i* BMRESTRICT src2)
+{
+    __m256i m1A, m1B, m1C, m1D;
+
+    {
+        __m256i s1_0, s2_0, s1_1, s2_1;
+
+        s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
+        s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
+        m1A = _mm256_and_si256(s1_0, s2_0);
+        m1B = _mm256_and_si256(s1_1, s2_1);
+        s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
+        s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
+        m1C = _mm256_and_si256(s1_0, s2_0);
+        m1D = _mm256_and_si256(s1_1, s2_1);
+    }
+    {
+        __m256i dst0, dst1;
+        dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
+
+        m1A = _mm256_and_si256(m1A, dst0);
+        m1B = _mm256_and_si256(m1B, dst1);
+
+        dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
+
+        m1C = _mm256_and_si256(m1C, dst0);
+        m1D = _mm256_and_si256(m1D, dst1);
+    }
+    _mm256_store_si256(dst + 0, m1A);
+    _mm256_store_si256(dst + 1, m1B);
+    _mm256_store_si256(dst + 2, m1C);
+    _mm256_store_si256(dst + 3, m1D);
+
+    m1A = _mm256_or_si256(m1A, m1B);
+    m1C = _mm256_or_si256(m1C, m1D);
+    m1A = _mm256_or_si256(m1A, m1C);
+
+    return _mm256_testz_si256(m1A, m1A);
+}
+
+
 /*!
     @brief AND array elements against another array (unaligned)
     *dst &= *src
@@ -1254,6 +1302,171 @@ bool avx2_sub_digest_2way(__m256i* BMRESTRICT dst,

+/*!
+    @brief SUB block digest stride
+    @ingroup AVX2
+*/
+inline
+bool avx2_sub_digest_5way(__m256i* BMRESTRICT dst,
+                          const __m256i* BMRESTRICT src1,
+                          const __m256i* BMRESTRICT src2,
+                          const __m256i* BMRESTRICT src3,
+                          const __m256i* BMRESTRICT src4)
+{
+    __m256i m1A, m1B, m1C, m1D;
+    __m256i m1E, m1F, m1G, m1H;
+    const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFFFFFFFF
+
+    {
+        __m256i s1_0, s2_0, s1_1, s2_1;
+
+        s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
+        s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
+        s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
+        s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
+
+        m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
+
+        s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
+        s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
+        s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
+        s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
+
+        m1C = _mm256_and_si256(s1_0, s2_0);
+        m1D = _mm256_and_si256(s1_1, s2_1);
+    }
+    {
+        __m256i s3_0, s4_0, s3_1, s4_1;
+
+        s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
+        s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
+        s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
+        s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
+
+        m1E = _mm256_and_si256(s3_0, s4_0);
+        m1F = _mm256_and_si256(s3_1, s4_1);
+
+        m1A = _mm256_and_si256(m1A, m1E);
+        m1B = _mm256_and_si256(m1B, m1F);
+
+        s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
+        s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
+        s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
+        s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
+
+        m1G = _mm256_and_si256(s3_0, s4_0);
+        m1H = _mm256_and_si256(s3_1, s4_1);
+    }
+    {
+        __m256i dst0, dst1;
+        dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
+
+        m1C = _mm256_and_si256(m1C, m1G);
+        m1D = _mm256_and_si256(m1D, m1H);
+        m1A = _mm256_and_si256(m1A, dst0);
+        m1B = _mm256_and_si256(m1B, dst1);
+
+        dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
+
+        m1C = _mm256_and_si256(m1C, dst0);
+        m1D = _mm256_and_si256(m1D, dst1);
+    }
+    _mm256_store_si256(dst + 0, m1A);
+    _mm256_store_si256(dst + 1, m1B);
+    _mm256_store_si256(dst + 2, m1C);
+    _mm256_store_si256(dst + 3, m1D);
+
+    m1A = _mm256_or_si256(m1A, m1B);
+    m1C = _mm256_or_si256(m1C, m1D);
+    m1A = _mm256_or_si256(m1A, m1C);
+
+    return _mm256_testz_si256(m1A, m1A);
+}
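// Scalar model of the SUB (AND-NOT) digest stride above: with maskF = ~0u,
// xor-ing an argument with maskF inverts it, so the kernel effectively
// computes dst &= ~src1 & ~src2 ... One digest bit spans 32 x 32-bit words,
// i.e. 4 x 256-bit vectors, which is why the SIMD body handles dst[0..3].
#include <cstdint>

inline bool sub_digest_stride_scalar(uint32_t* dst,
                                     const uint32_t* src1,
                                     const uint32_t* src2)
{
    uint32_t acc = 0;
    for (unsigned i = 0; i < 32; ++i)  // one 1024-bit digest stride
    {
        dst[i] &= ~src1[i] & ~src2[i];
        acc |= dst[i];
    }
    return acc == 0; // true: stride went all-zero, its digest bit clears
}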
+
+/*!
+    @brief SUB block digest stride
+    @ingroup AVX2
+*/
+inline
+bool avx2_sub_digest_3way(__m256i* BMRESTRICT dst,
+                          const __m256i* BMRESTRICT src1,
+                          const __m256i* BMRESTRICT src2)
+{
+    __m256i m1A, m1B, m1C, m1D;
+//    __m256i m1E, m1F, m1G, m1H;
+    const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFFFFFFFF
+
+    {
+        __m256i s1_0, s2_0, s1_1, s2_1;
+
+        s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
+        s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
+        s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
+        s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
+
+        m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
+
+        s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
+        s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
+        s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
+        s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
+
+        m1C = _mm256_and_si256(s1_0, s2_0);
+        m1D = _mm256_and_si256(s1_1, s2_1);
+    }
+    /*
+    {
+        __m256i s3_0, s4_0, s3_1, s4_1;
+
+        s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
+        s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
+        s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
+        s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
+
+        m1E = _mm256_and_si256(s3_0, s4_0);
+        m1F = _mm256_and_si256(s3_1, s4_1);
+
+        m1A = _mm256_and_si256(m1A, m1E);
+        m1B = _mm256_and_si256(m1B, m1F);
+
+        s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
+        s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
+        s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
+        s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
+
+        m1G = _mm256_and_si256(s3_0, s4_0);
+        m1H = _mm256_and_si256(s3_1, s4_1);
+    }
+    */
+    {
+        __m256i dst0, dst1;
+        dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
+
+//        m1C = _mm256_and_si256(m1C, m1G);
+//        m1D = _mm256_and_si256(m1D, m1H);
+        m1A = _mm256_and_si256(m1A, dst0);
+        m1B = _mm256_and_si256(m1B, dst1);
+
+        dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
+
+        m1C = _mm256_and_si256(m1C, dst0);
+        m1D = _mm256_and_si256(m1D, dst1);
+    }
+    _mm256_store_si256(dst + 0, m1A);
+    _mm256_store_si256(dst + 1, m1B);
+    _mm256_store_si256(dst + 2, m1C);
+    _mm256_store_si256(dst + 3, m1D);
+
+    m1A = _mm256_or_si256(m1A, m1B);
+    m1C = _mm256_or_si256(m1C, m1D);
+    m1A = _mm256_or_si256(m1A, m1C);
+
+    return _mm256_testz_si256(m1A, m1A);
+}
+
+
 /*!
     @brief AVX2 block memset
     *dst = value

     @ingroup AVX2
 */
@@ -2178,10 +2391,11 @@ bool avx2_bit_find_first_diff(const __m256i* BMRESTRICT block1,
     @ingroup AVX2
 */
 inline
-bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned* pos)
+bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned off,
                          unsigned* pos)
 {
     unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;

+    block = (const __m256i*)((bm::word_t*)(block) + off);
     const __m256i* block_end =
         (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
     __m256i maskZ = _mm256_setzero_si256();
@@ -2203,7 +2417,7 @@ bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned off,
             unsigned widx = bsf >> 2; // (bsf / 4);
             unsigned w = simd_buf[widx];
             bsf = bm::bsf_asm32(w); // find first bit != 0
-            *pos = (simd_lane * 256) + (widx * 32) + bsf;
+            *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;
             return true;
         }
         // invert to find (w != 0)
@@ -2214,7 +2428,7 @@ bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned off,
             unsigned widx = bsf >> 2; // (bsf / 4);
             unsigned w = simd_buf[widx];
             bsf = bm::bsf_asm32(w); // find first bit != 0
-            *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
+            *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;
             return true;
         }

@@ -2707,6 +2921,7 @@ int avx2_cmpge_u16(__m256i vect16, unsigned short value)
     return -1;
 }

+
 /**
     Hybrid binary search, starts as binary, then switches to scan

     \param buf - GAP buffer pointer.
     \param pos - index of the element.
     \param is_set - output. GAP value (0 or 1).
-    \return GAP index.
+    \return GAP index OR bit-test

     @ingroup AVX2
 */
-inline
+template<bool RET_TEST=false>
 unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
                         unsigned pos, unsigned* BMRESTRICT is_set)
 {
-    BM_ASSERT(is_set);
+    BM_ASSERT(is_set || RET_TEST);

-    const unsigned linear_cutoff = 48;
+    const unsigned linear_cutoff = 64;//48;
     const unsigned unroll_factor = 16;
     BM_ASSERT(pos < bm::gap_max_bits);

     unsigned res;
     unsigned start = 1;
-    unsigned end = 1 + ((*buf) >> 3);
-    unsigned arr_end = end;
+    unsigned end = ((*buf) >> 3);

-    if (end - start < unroll_factor) // too small for a full AVX stride
+    const unsigned arr_end = end + 1;
+    if (end <= unroll_factor) // too small for a full AVX stride
     {
-        for (; start < end; ++start)
-        {
+        for (; true; ++start)
             if (buf[start] >= pos)
-            {
-                res = ((*buf) & 1) ^ ((start-1) & 1);
-                *is_set = res;
-                return start;
-            }
-        } // for
+                goto ret;
         BM_ASSERT(0);
     }

-    while (start != end)
+    do
     {
         unsigned dsize = end - start;
+        for (; dsize >= 64; dsize = end - start)
+        {
+            unsigned mid = (start + end) >> 1;
+            if (buf[mid] < pos)
+                start = mid+1;
+            else
+                end = mid;
+            if (buf[mid = (start + end) >> 1] < pos)
+                start = mid+1;
+            else
+                end = mid;
+            if (buf[mid = (start + end) >> 1] < pos)
+                start = mid+1;
+            else
+                end = mid;
+            if (buf[mid = (start + end) >> 1] < pos)
+                start = mid+1;
+            else
+                end = mid;
+            BM_ASSERT(buf[end] >= pos);
+        } // for
+
+        dsize = end - start + 1;
         if (dsize < linear_cutoff)
         {
             // set wider scan window to possibly over-read the range,
             __m256i mZ = _mm256_setzero_si256();
             __m256i mPos  = _mm256_set1_epi16((unsigned short)pos);
             __m256i vect16, mSub, mge_mask;

-            unsigned len_unr = start + (dsize - (dsize % unroll_factor));
-            for (; start < len_unr; start += unroll_factor)
+            for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
                 start < len_unr; start += unroll_factor)
             {
-                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); // 16x u16s
+                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
                 mSub = _mm256_subs_epu16(mPos, vect16);
                 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
-                int mask = _mm256_movemask_epi8(mge_mask);
-                if (mask)
+                if (int mask = _mm256_movemask_epi8(mge_mask); mask)
                 {
-                    int lz = _tzcnt_u32(mask) / 2;
-                    start += lz;
-                    res = ((*buf) & 1) ^ ((start-1) & 1);
-                    *is_set = res;
-                    return start;
+                    int lz = _tzcnt_u32(mask);
+                    start += (lz >> 1);
+                    goto ret;
                 }
-            } // for k
-            unsigned tail = unroll_factor - (end - start);
-            if (start > tail+1)
+            } // for
+//            if (unsigned tail = unroll_factor-(end-start); start > tail+1)
             {
-                start -= tail; // rewind back, but stay within block
-                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); // 16x u16s
+                start = end - 15;
+                BM_ASSERT(buf[start + 15] >= pos);
+                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
                 mSub = _mm256_subs_epu16(mPos, vect16);
                 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
                 int mask = _mm256_movemask_epi8(mge_mask);
-                BM_ASSERT(mask); // the rersult MUST be here at this point
-
-                int lz = _tzcnt_u32(mask) / 2;
-                start += lz;
-                res = ((*buf) & 1) ^ ((start-1) & 1);
-                *is_set = res;
-                return start;
+                BM_ASSERT(mask); // the result MUST be here at this point
+                int lz = _tzcnt_u32(mask);
+                start += (lz >> 1);
+                goto ret;
             }
-            for (; start < end; ++start)
-            {
+            for (; true; ++start)
                 if (buf[start] >= pos)
-                    break;
-            } // for
-            break;
+                    goto ret;
+            BM_ASSERT(0);
         }
-        unsigned curr = (start + end) >> 1;
-        if (buf[curr] < pos)
-            start = curr + 1;
+
+        if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
+            start = mid + 1;
         else
-            end = curr;
-    } // while
+            end = mid;
+        if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
+            start = mid + 1;
+        else
+            end = mid;
+    } while (1);
+ret:
     res = ((*buf) & 1) ^ ((start-1) & 1);
-    *is_set = res;
-    return start;
+    if constexpr(RET_TEST)
+        return res;
+    else
+    {
+        *is_set = res;
+        return start;
+    }
 }

@@ -2822,9 +3056,7 @@ unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
 inline
 unsigned avx2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
 {
-    unsigned is_set;
-    bm::avx2_gap_bfind(buf, pos, &is_set);
-    return is_set;
+    return bm::avx2_gap_bfind<true>(buf, pos, 0);
 }
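// Plain binary-search reference for the hybrid search above: a GAP block is
// a sorted array of inclusive run-end positions; buf[0] carries the value of
// the very first run in its LSB, so the value at any position is that bit
// XOR-ed with the parity of the run index found (exactly the expression at
// the ret: label).
#include <cstdint>

inline unsigned gap_test_scalar(const uint16_t* buf, unsigned pos)
{
    unsigned start = 1, end = (*buf) >> 3;  // entries [1..end] are run ends
    while (start != end)                    // lower_bound over run ends
    {
        unsigned mid = (start + end) >> 1;
        if (buf[mid] < pos)
            start = mid + 1;
        else
            end = mid;
    }
    return ((*buf) & 1) ^ ((start - 1) & 1); // first-run bit ^ run parity
}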
 /**
@@ -3236,6 +3468,9 @@ void avx2_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
     avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

+#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
+    avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
+
 #define VECT_OR_BLOCK(dst, src) \
     avx2_or_block((__m256i*) dst, (__m256i*) (src))

@@ -3260,6 +3495,12 @@ void avx2_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
     avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

+#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
+    avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
+
+#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
+    avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
+
 #define VECT_XOR_BLOCK(dst, src) \
     avx2_xor_block((__m256i*) dst, (__m256i*) (src))

@@ -3323,8 +3564,8 @@ void avx2_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_BIT_TO_GAP(dest, src, dest_len) \
     avx2_bit_to_gap(dest, src, dest_len)

-#define VECT_BIT_FIND_FIRST(src1, pos) \
-    avx2_bit_find_first((__m256i*) src1, pos)
+#define VECT_BIT_FIND_FIRST(src1, off, pos) \
+    avx2_bit_find_first((__m256i*) src1, off, pos)

 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
     avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)

@@ -3338,6 +3579,10 @@ void avx2_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_GAP_BFIND(buf, pos, is_set) \
     avx2_gap_bfind(buf, pos, is_set)

+#define VECT_GAP_TEST(buf, pos) \
+    avx2_gap_test(buf, pos)
+
+
 #define VECT_BIT_COUNT_DIGEST(blk, d) \
     avx2_bit_block_count(blk, d)

diff --git a/tools/tax/src/bm/bmblocks.h b/tools/tax/src/bm/bmblocks.h
index 15389d21..3f47d350 100644
--- a/tools/tax/src/bm/bmblocks.h
+++ b/tools/tax/src/bm/bmblocks.h
@@ -51,21 +51,25 @@ class blocks_manager
     typedef bm::id_t id_type;
     typedef bm::id_t block_idx_type;
 #endif
+    typedef id_type size_type;

-    /// Allocation arena
+    /// Allocation arena for ReadOnly vectors
     ///
     /// @internal
     struct arena
     {
-        bm::word_t*      blocks_;     ///< bit-blocks area
-        bm::gap_word_t*  gap_blocks_; ///< GAP blocks area
-        bm::word_t**     blk_blks_;   ///< PTR sub-blocks area
-        bv_arena_statistics st_;      ///< statistics and sizes
+        void*            a_ptr_;      ///< main allocated pointer
+        bm::word_t***    top_blocks_; ///< top descriptor
+        bm::word_t*      blocks_;     ///< bit-blocks area
+        bm::gap_word_t*  gap_blocks_; ///< GAP blocks area
+        bm::word_t**     blk_blks_;   ///< PTR sub-blocks area
+        bm::bv_arena_statistics st_;  ///< statistics and sizes

         /// Set all arena fields to zero
         void reset()
-        { blocks_ = 0; gap_blocks_ = 0; blk_blks_ = 0; st_.reset(); }
+        { a_ptr_ = 0; top_blocks_ = 0; blocks_ = 0; gap_blocks_ = 0; blk_blks_ = 0;
+          st_.reset(); }
     };

@@ -338,21 +342,16 @@ class blocks_manager
 public:

     blocks_manager()
-        : max_bits_(bm::id_max),
-          top_blocks_(0),
-          temp_block_(0),
-          alloc_(Alloc())
+        : alloc_(Alloc())
     {
         ::memcpy(glevel_len_, bm::gap_len_table<true>::_len, sizeof(glevel_len_));
         top_block_size_ = 1;
     }

     blocks_manager(const gap_word_t* glevel_len,
-                   id_type           max_bits,
+                   id_type           max_bits,
                    const Alloc&      alloc = Alloc())
         : max_bits_(max_bits),
-          top_blocks_(0),
-          temp_block_(0),
           alloc_(alloc)
     {
         ::memcpy(glevel_len_, glevel_len, sizeof(glevel_len_));
@@ -361,22 +360,23 @@ class blocks_manager

     blocks_manager(const blocks_manager& blockman)
         : max_bits_(blockman.max_bits_),
-          top_blocks_(0),
           top_block_size_(blockman.top_block_size_),
-          temp_block_(0),
           alloc_(blockman.alloc_)
     {
         ::memcpy(glevel_len_, blockman.glevel_len_, sizeof(glevel_len_));

         if (blockman.is_init())
-            this->copy(blockman);
+        {
+            if (blockman.arena_)
+                this->copy_to_arena(blockman);
+            else
+                this->copy(blockman);
+        }
     }

 #ifndef BM_NO_CXX11
     blocks_manager(blocks_manager&& blockman) BMNOEXCEPT
         : max_bits_(blockman.max_bits_),
-          top_blocks_(0),
           top_block_size_(blockman.top_block_size_),
-          temp_block_(0),
           alloc_(blockman.alloc_)
     {
         ::memcpy(glevel_len_, blockman.glevel_len_, sizeof(glevel_len_));
@@ -388,7 +388,7 @@ class blocks_manager
     {
         if (temp_block_)
             alloc_.free_bit_block(temp_block_);
-        destroy_tree();
+        deinit_tree();
     }

     /*! \brief Swaps content
@@ -405,11 +405,11 @@ class blocks_manager
         bm::xor_swap(this->max_bits_, bm.max_bits_);
         bm::xor_swap(this->top_block_size_, bm.top_block_size_);

+        arena* ar = arena_; arena_ = bm.arena_; bm.arena_ = ar;
+
         BM_ASSERT(sizeof(glevel_len_) / sizeof(glevel_len_[0])
                                         == bm::gap_levels); // paranoia check
         for (unsigned i = 0; i < bm::gap_levels; ++i)
-        {
             bm::xor_swap(glevel_len_[i], bm.glevel_len_[i]);
-        }
     }
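// Usage sketch for the read-only arena path that the blocks_manager changes
// above enable (assumes the freeze()/is_ro() bvector API declared earlier):
// a frozen vector repacks its blocks into one contiguous arena allocation,
// after which any modifying call is undefined behavior.
#include "bm.h"

void freeze_example()
{
    bm::bvector<> bv;
    bv.set(100);
    bv.set(65536);

    bv.freeze();                 // repack blocks into a single arena
    bool ro = bv.is_ro();        // now true
    bool b  = bv.test(100);      // reads remain valid

    (void)ro; (void)b;
}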
     /*!
        \brief implementation of moving semantics
@@ -706,7 +706,7 @@ class blocks_manager

     /** set all-Zero block pointers for [start..end]
     */
-    void set_all_zero(block_idx_type nb, block_idx_type nb_to)
+    void set_all_zero(block_idx_type nb, block_idx_type nb_to) BMNOEXCEPT
     {
         BM_ASSERT(nb <= nb_to);

@@ -1128,7 +1128,7 @@ class blocks_manager
         {
             block = alloc_.alloc_bit_block();
             // initialize block depending on its previous status
-            bit_block_set(block, block_flag ? 0xFF : 0);
+            bm::bit_block_set(block, block_flag ? ~0u : 0);
             set_block(nb, block);
         }
         else // gap block requested
@@ -1188,7 +1186,6 @@ class blocks_manager
     */
     bm::word_t** check_alloc_top_subblock(unsigned nblk_blk)
     {
-
         if (!top_blocks_[nblk_blk])
             return alloc_top_subblock(nblk_blk);
         if (top_blocks_[nblk_blk] == (bm::word_t**)FULL_BLOCK_FAKE_ADDR)
@@ -1401,20 +1400,10 @@ class blocks_manager
         copy_bit_block(i, j, src_block);
     }

-    /**
-        Optimize bit-block
-    */
-    void optimize_bit_block(block_idx_type nb)
-    {
-        unsigned i, j;
-        bm::get_block_coord(nb, i, j);
-        optimize_bit_block(i, j);
-    }
-
     /**
         Optimize bit-block at i-j position
     */
-    void optimize_bit_block(unsigned i, unsigned j)
+    void optimize_bit_block(unsigned i, unsigned j, int opt_mode)
     {
         bm::word_t* block = get_block_ptr(i, j);
         if (IS_VALID_ADDR(block))
@@ -1429,6 +1418,9 @@ class blocks_manager
                 return_tempblock(block);
                 return;
             }
+            if (opt_mode < 3) // less than opt_compress
+                return;
+
             unsigned threashold = this->glen(bm::gap_max_level)-4;
             if (gap_count < threashold) // compressable
             {
@@ -1601,7 +1593,7 @@ class blocks_manager
     /**
         Free block, make it zero pointer in the tree
     */
-    void zero_block(block_idx_type nb)
+    void zero_block(block_idx_type nb) BMNOEXCEPT
     {
         unsigned i, j;
         get_block_coord(nb, i, j);
@@ -1614,7 +1606,7 @@ class blocks_manager
     /**
         Free block, make it zero pointer in the tree
     */
-    void zero_block(unsigned i, unsigned j)
+    void zero_block(unsigned i, unsigned j) BMNOEXCEPT
     {
         BM_ASSERT(top_blocks_ && i < top_block_size_);

@@ -1625,14 +1617,19 @@ class blocks_manager
             blk_blk = alloc_top_subblock(i, FULL_BLOCK_FAKE_ADDR);

         bm::word_t* block = blk_blk[j];
+        blk_blk[j] = 0;
         if (IS_VALID_ADDR(block))
         {
             if (BM_IS_GAP(block))
+            {
                 alloc_.free_gap_block(BMGAP_PTR(block), glen());
+            }
             else
+            {
                 alloc_.free_bit_block(block);
+            }
         }
-        blk_blk[j] = 0;
+
         if (j == bm::set_sub_array_size-1)
         {
             // back scan if top sub-block can also be dropped
@@ -1755,7 +1752,7 @@ class blocks_manager
        \param nb - Block's linear index.
        \param blk - Blocks's pointer

-       \return new GAP block pointer or NULL if block type mutated
+       \return new GAP block pointer or NULL if block type mutated into NULL
     */
     bm::gap_word_t* extend_gap_block(block_idx_type nb, gap_word_t* blk)
     {
@@ -1871,10 +1868,7 @@ class blocks_manager
         return top_blocks_[nsub] == NULL;
     }

-    bm::word_t*** top_blocks_root() BMNOEXCEPT
-    {
-        return top_blocks_;
-    }
+    bm::word_t*** top_blocks_root() BMNOEXCEPT { return top_blocks_; }

     /*! \brief Returns current GAP level vector
     */
@@ -1963,7 +1957,16 @@ class blocks_manager
         }
         top_blocks_ = 0;
     }
-
+
+    /// allocate first level of descr. of blocks
+    void init_tree(unsigned top_size)
+    {
+        BM_ASSERT(top_blocks_ == 0);
+        if (top_size > top_block_size_)
+            top_block_size_ = top_size;
+        init_tree();
+    }
+
     // ----------------------------------------------------------------
     #define BM_FREE_OP(x) blk = blk_blk[j + x]; \
         if (IS_VALID_ADDR(blk)) \
@@ -2018,6 +2021,8 @@ class blocks_manager
     */
     void destroy_tree() BMNOEXCEPT
     {
+        BM_ASSERT(!arena_); // arena must be NULL here
+
         if (!top_blocks_)
             return;

@@ -2027,18 +2032,14 @@ class blocks_manager
             bm::word_t** blk_blk = top_blocks_[i];
             if (!blk_blk)
             {
-                ++i;
+                ++i; // look ahead
                 bool found = bm::find_not_null_ptr(top_blocks_, i, top_blocks, &i);
-                if (!found)
+                if (!found) // nothing to do
                     break;
                 blk_blk = top_blocks_[i];
             }
-            if ((bm::word_t*)blk_blk == FULL_BLOCK_FAKE_ADDR)
-            {
-                ++i;
-                continue;
-            }
-            deallocate_top_subblock(i);
+            if ((bm::word_t*)blk_blk != FULL_BLOCK_FAKE_ADDR)
+                deallocate_top_subblock(i);
             ++i;
         } // for i

@@ -2048,7 +2049,10 @@ class blocks_manager

     void deinit_tree() BMNOEXCEPT
     {
-        destroy_tree();
+        if (arena_)
+            destroy_arena();
+        else
+            destroy_tree();
         top_blocks_ = 0; top_block_size_ = 0;
     }

@@ -2234,8 +2238,9 @@ class blocks_manager

                 if (bv_stat)
                 {
+                    unsigned level = bm::gap_level(gap_blk);
                     bv_stat->add_gap_block(
-                        bm::gap_capacity(gap_blk, glen()), len);
+                        bm::gap_capacity(gap_blk, glen()), len, level);
                 }
             }
         }
@@ -2292,9 +2297,10 @@ class blocks_manager
                     set_block_ptr(i, j, blk);
                     if (bv_stat)
                     {
+                        level = bm::gap_level(gap_blk);
                         bv_stat->add_gap_block(
                                     bm::gap_capacity(gap_blk, glen()),
-                                    bm::gap_length(gap_blk));
+                                    bm::gap_length(gap_blk), unsigned(level));
                     }
                 }
                 else // non-compressable bit-block
@@ -2378,6 +2384,116 @@ class blocks_manager

    // -----------------------------------------------------------------------
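// Model of the sizing contract between calc_arena_stat() (below) and
// alloc_arena(): each region's element count is accumulated during a tree
// walk, and one malloc covers them all. The get_alloc_size() body here is
// an assumption for illustration -- it simply sums the byte footprint of
// the four regions in the same order alloc_arena() carves them.
#include <cstddef>

struct arena_stat_model
{
    size_t   bit_blocks_sz;     // total bit-block words
    size_t   gap_blocks_sz;     // total gap_word_t units
    size_t   ptr_sub_blocks_sz; // sub-block pointer slots
    unsigned top_block_size;    // top-level pointer slots

    size_t get_alloc_size() const
    {
        return bit_blocks_sz * sizeof(bm::word_t)
             + gap_blocks_sz * sizeof(bm::gap_word_t)
             + (ptr_sub_blocks_sz + top_block_size) * sizeof(void*);
    }
};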
+
+    /*!
+        @brief Calculates bitvector arena statistics.
+    */
+    void calc_arena_stat(bm::bv_arena_statistics* st) const BMNOEXCEPT
+    {
+        BM_ASSERT(st);
+
+        st->reset();
+        const bm::word_t* const * const* blk_root = top_blocks_root();
+
+        if (!blk_root)
+            return;
+        unsigned top_size = st->top_block_size = top_block_size();
+        for (unsigned i = 0; i < top_size; ++i)
+        {
+            const bm::word_t* const* blk_blk = blk_root[i];
+            if (!blk_blk)
+            {
+                ++i;
+                bool found = bm::find_not_null_ptr(blk_root, i, top_size, &i);
+                if (!found)
+                    break;
+                blk_blk = blk_root[i];
+                BM_ASSERT(blk_blk);
+                if (!blk_blk)
+                    break;
+            }
+            if ((bm::word_t*)blk_blk == FULL_BLOCK_FAKE_ADDR)
+                continue;
+            st->ptr_sub_blocks_sz += bm::set_sub_array_size;
+            for (unsigned j = 0; j < bm::set_sub_array_size; ++j)
+            {
+                const bm::word_t* blk = blk_blk[j];
+                if (IS_VALID_ADDR(blk))
+                {
+                    if (BM_IS_GAP(blk))
+                    {
+                        const bm::gap_word_t* gap_blk = BMGAP_PTR(blk);
+                        unsigned len = bm::gap_length(gap_blk);
+                        BM_ASSERT(gap_blk[len-1] == 65535);
+                        st->gap_blocks_sz += len;
+                    }
+                    else // bit block
+                        st->bit_blocks_sz += bm::set_block_size;
+                }
+            } // for j
+        } // for i
+
+    }
+
+    /**
+        Arena allocation memory guard
+        @internal
+    */
+    struct arena_guard
+    {
+        arena_guard(arena* ar, blocks_manager& bman) noexcept
+            : ar_(ar), bman_(bman)
+        {}
+        ~arena_guard() noexcept
+        {
+            if (ar_)
+            {
+                blocks_manager::free_arena(ar_, bman_.alloc_);
+                ::free(ar_);
+            }
+        }
+        void release() noexcept { ar_ = 0; }
+
+        arena* ar_;
+        blocks_manager& bman_;
+    };
+
+    /// calculate arena statistics, calculate and copy all blocks there
+    ///
+    void copy_to_arena(const blocks_manager& bman)
+    {
+        BM_ASSERT(arena_ == 0 && top_blocks_ == 0);
+
+        bm::bv_arena_statistics src_arena_st;
+        if (bman.arena_)
+            src_arena_st = bman.arena_->st_;
+        else
+            bman.calc_arena_stat(&src_arena_st);
+
+        arena* ar = (arena*)::malloc(sizeof(arena));
+        if (!ar)
+        {
+        #ifndef BM_NO_STL
+            throw std::bad_alloc();
+        #else
+            BM_THROW(BM_ERR_BADALLOC);
+        #endif
+        }
+        ar->reset();
+        arena_guard aguard(ar, *this); // alloc_arena can throw an exception
+
+        alloc_arena(ar, src_arena_st, get_allocator());
+        bman.copy_to_arena(ar);
+        arena_ = ar;
+        aguard.release(); // ownership of arena is transferred
+
+        // reset the top tree link to work with the arena
+        top_blocks_ = ar->top_blocks_;
+        top_block_size_ = ar->st_.top_block_size;
+    }
+
+
     // ----------------------------------------------------------------

     /// Allocate arena (content memory) based on arena statistics
@@ -2392,12 +2508,41 @@ class blocks_manager
     {
         BM_ASSERT(ar);
         ar->st_ = st;
-        ar->blk_blks_ = (bm::word_t**) alloc.alloc_ptr(st.ptr_sub_blocks_sz);
-        ar->blocks_ = alloc.alloc_bit_block(
-                        unsigned(st.bit_blocks_sz / bm::set_block_size));
-        unsigned len =
-            (unsigned)(st.gap_blocks_sz / (sizeof(bm::word_t) / sizeof(bm::gap_word_t)));
-        ar->gap_blocks_ = (bm::gap_word_t*)alloc.get_block_alloc().allocate(len, 0);
+
+        // compute total allocation size in bytes
+        size_t alloc_sz = st.get_alloc_size();
+        // size to alloc in pointers
+        size_t alloc_sz_v = (alloc_sz + (sizeof(void*)-1)) / sizeof(void*);
+
+        char* arena_mem_ptr = (char*) alloc.alloc_ptr(alloc_sz_v);
+        ar->a_ptr_ = arena_mem_ptr;
+
+        if (st.bit_blocks_sz)
+        {
+            ar->blocks_ = (bm::word_t*)arena_mem_ptr;
+            BM_ASSERT(bm::is_aligned(ar->blocks_));
+            arena_mem_ptr += st.bit_blocks_sz * sizeof(bm::word_t);
+        }
+        else
+            ar->blocks_ = 0;
+
+        ar->top_blocks_ = (bm::word_t***) arena_mem_ptr;
+        for (unsigned i = 0; i < ar->st_.top_block_size; ++i) // init as NULLs
+            ar->top_blocks_[i] = 0;
+        arena_mem_ptr += st.top_block_size * sizeof(void*);
+
+        if (st.ptr_sub_blocks_sz)
+        {
+            ar->blk_blks_ = (bm::word_t**) arena_mem_ptr;
+            arena_mem_ptr += st.ptr_sub_blocks_sz * sizeof(void*);
+        }
+        else
+            ar->blk_blks_ = 0;
+
+        if (st.gap_blocks_sz)
+            ar->gap_blocks_ = (bm::gap_word_t*)arena_mem_ptr;
+        else
+            ar->gap_blocks_ = 0;
     }

     // ----------------------------------------------------------------
@@ -2407,25 +2552,47 @@ class blocks_manager
     /// @param alloc - allocator
     ///
     static
-    void free_arena(arena* ar, allocator_type& alloc)
+    void free_arena(arena* ar, allocator_type& alloc) BMNOEXCEPT
     {
         BM_ASSERT(ar);
+        if (ar->a_ptr_)
+        {
+            size_t alloc_sz = ar->st_.get_alloc_size();
+            // size to alloc in pointers
+            size_t alloc_sz_v = (alloc_sz + (sizeof(void*)-1)) / sizeof(void*);
+            alloc.free_ptr(ar->a_ptr_, alloc_sz_v);
+        }
+    }

-        alloc.free_ptr(ar->blk_blks_, ar->st_.ptr_sub_blocks_sz);
-        alloc.free_bit_block(ar->blocks_, unsigned(ar->st_.bit_blocks_sz / bm::set_block_size));
-        unsigned len =
-            (unsigned)(ar->st_.gap_blocks_sz / (sizeof(bm::word_t) / sizeof(bm::gap_word_t)));
-        alloc.get_block_alloc().deallocate((bm::word_t*)ar->gap_blocks_, len);
+    // ----------------------------------------------------------------
+    /// free all arena memory
+    ///
+    void destroy_arena() BMNOEXCEPT
+    {
+        free_arena(arena_, alloc_);
+        ::free(arena_);
+        arena_ = 0; top_blocks_ = 0; top_block_size_ = 0;
     }

     // ----------------------------------------------------------------

-    void copy_to_arena(arena* ar,
-                       const bm::bv_arena_statistics& arena_st,
-                       bm::bv_arena_statistics& st) const BMNOEXCEPT
+    /**
+        Copy blocks into arena allocated memory
+        @param ar - target allocated arena
+        @sa alloc_arena
+    */
+    void copy_to_arena(arena* ar) const BMNOEXCEPT
     {
-        (void) arena_st;
-        bm::word_t*** blk_root = top_blocks_root();
+        BM_ASSERT(ar);
+
+        bm::bv_arena_statistics& st = ar->st_; (void) st;
+        bm::bv_arena_statistics arena_st;
+        arena_st.reset();
+
+        bm::word_t*** blk_root = ar->top_blocks_;
+        const bm::word_t* const * const * blk_root_arg = top_blocks_root();
+        BM_ASSERT(blk_root_arg);
+
         // arena target pointers
         bm::word_t** t_blk_blk = ar->blk_blks_;
         bm::word_t* t_block = ar->blocks_;
@@ -2434,29 +2601,28 @@ class blocks_manager
         unsigned top_size = top_block_size();
         for (unsigned i = 0; i < top_size; ++i)
         {
-            const bm::word_t* const* blk_blk = blk_root[i];
-            if (!blk_blk)
+            const bm::word_t* const* blk_blk_arg = blk_root_arg[i];
+            if (!blk_blk_arg)
             {
                 ++i;
-                bool found = bm::find_not_null_ptr(blk_root, i, top_size, &i);
+                bool found = bm::find_not_null_ptr(blk_root_arg, i, top_size, &i);
                 if (!found)
                     break;
-                blk_blk = blk_root[i];
-                BM_ASSERT(blk_blk);
-                if (!blk_blk)
+                blk_blk_arg = blk_root_arg[i];
+                BM_ASSERT(blk_blk_arg);
+                if (!blk_blk_arg)
                     break;
             }
-            if ((bm::word_t*)blk_blk == FULL_BLOCK_FAKE_ADDR)
+            if ((bm::word_t*)blk_blk_arg == FULL_BLOCK_FAKE_ADDR)
+            {
+                blk_root[i] = (bm::word_t**)FULL_BLOCK_FAKE_ADDR;
                 continue;
+            }
             blk_root[i] = t_blk_blk;
-            t_blk_blk += bm::set_sub_array_size;
-            st.ptr_sub_blocks_sz += bm::set_sub_array_size;
-            BM_ASSERT(st.ptr_sub_blocks_sz <= arena_st.ptr_sub_blocks_sz);
-
             for (unsigned j = 0; j < bm::set_sub_array_size; ++j)
             {
-                const bm::word_t* blk = blk_blk[j];
+                const bm::word_t* blk = blk_blk_arg[j];
                 t_blk_blk[j] = (bm::word_t*)blk; // copy FULL and NULL blocks
                 if (!IS_VALID_ADDR(blk))
                     continue;
@@ -2472,19 +2638,26 @@ class blocks_manager
                     BMSET_PTRGAP(blk_p);
                     t_blk_blk[j] = blk_p;
                     t_gap_block += len;
-                    st.gap_blocks_sz += len;
-                    BM_ASSERT(st.gap_blocks_sz < arena_st.gap_blocks_sz);
+
+                    arena_st.gap_blocks_sz += len;
+                    BM_ASSERT(st.gap_blocks_sz >= arena_st.gap_blocks_sz);
                 }
                 else // bit block
                 {
                     bm::bit_block_copy(t_block, blk);
                     t_blk_blk[j] = t_block;
                     t_block += bm::set_block_size;
-                    st.bit_blocks_sz += bm::set_block_size;
-                    BM_ASSERT(st.bit_blocks_sz < arena_st.bit_blocks_sz);
+
+                    arena_st.bit_blocks_sz += bm::set_block_size;
+                    BM_ASSERT(st.bit_blocks_sz >= arena_st.bit_blocks_sz);
                 }
             } // for j
+
+            t_blk_blk += bm::set_sub_array_size;
+            arena_st.ptr_sub_blocks_sz += bm::set_sub_array_size;
+            BM_ASSERT(st.ptr_sub_blocks_sz >= arena_st.ptr_sub_blocks_sz);
+
         } // for i
     }

@@ -2498,6 +2671,7 @@ class blocks_manager
               block_idx_type block_from = 0,
               block_idx_type block_to = bm::set_total_blocks)
     {
+        BM_ASSERT(!arena_);
         bm::word_t*** blk_root_arg = blockman.top_blocks_root();
         if (!blk_root_arg)
             return;
@@ -2584,17 +2758,19 @@ class blocks_manager
 private:
     /// maximum addressable bits
-    id_type max_bits_;
+    id_type max_bits_ = bm::id_max;
     /// Tree of blocks.
-    bm::word_t*** top_blocks_;
+    bm::word_t*** top_blocks_ = 0;
     /// Size of the top level block array in blocks_ tree
-    unsigned top_block_size_;
+    unsigned top_block_size_;
     /// Temp block.
-    bm::word_t* temp_block_;
+    bm::word_t* temp_block_ = 0;
     /// vector defines gap block lengths for different levels
     gap_word_t glevel_len_[bm::gap_levels];
     /// allocator
     allocator_type alloc_;
+    /// memory arena pointer
+    arena* arena_ = 0;
 };

 /**
@@ -2637,6 +2813,49 @@ class bit_block_guard
     bm::word_t* block_;
 };
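// Usage sketch for the generalized pool guard defined below: it binds a
// pool to any PCLASS exposing set_allocator_pool()/get_allocator_pool()
// and unbinds it on scope exit; assign_if_not_set() is the non-stealing
// variant that leaves an already-bound pool alone.
#include "bm.h"

void guard_example(bm::bvector<>& bv1, bm::bvector<>& bv2)
{
    typedef bm::bvector<>::allocator_pool_type pool_type;
    pool_type pool;

    bm::alloc_pool_guard<pool_type, bm::bvector<> > g1(pool, bv1);
    bm::alloc_pool_guard<pool_type, bm::bvector<> > g2;
    g2.assign_if_not_set(pool, bv2); // binds only if bv2 has no pool yet
    // ... temporary blocks of bv1/bv2 now recycle through 'pool'
}   // guards restore a null pool pointer here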
+/*!
+    Resource guard for PCLASS::set_allocator_pool()
+    @ingroup bvector
+    @internal
+*/
+template<typename POOL, typename PCLASS>
+class alloc_pool_guard
+{
+public:
+    alloc_pool_guard() BMNOEXCEPT : optr_(0)
+    {}
+
+    alloc_pool_guard(POOL& pool, PCLASS& obj) BMNOEXCEPT
+        : optr_(&obj)
+    {
+        obj.set_allocator_pool(&pool);
+    }
+    ~alloc_pool_guard() BMNOEXCEPT
+    {
+        if (optr_)
+            optr_->set_allocator_pool(0);
+    }
+
+    /// check if vector has no assigned allocator and set one
+    void assign_if_not_set(POOL& pool,
+                           PCLASS& obj) BMNOEXCEPT
+    {
+        if (!obj.get_allocator_pool()) // alloc pool not set yet
+        {
+            BM_ASSERT(!optr_);
+            optr_ = &obj;
+            optr_->set_allocator_pool(&pool);
+        }
+    }
+
+private:
+    alloc_pool_guard(const alloc_pool_guard&) = delete;
+    void operator=(const alloc_pool_guard&) = delete;
+private:
+    PCLASS* optr_; ///< guarded object
+};
+
+
 }
diff --git a/tools/tax/src/bm/bmbmatrix.h b/tools/tax/src/bm/bmbmatrix.h
index cdfd9b2f..97f2d811 100644
--- a/tools/tax/src/bm/bmbmatrix.h
+++ b/tools/tax/src/bm/bmbmatrix.h
@@ -101,8 +101,9 @@ class basic_bmatrix
 #endif

-    void set_allocator_pool(allocator_pool_type* pool_ptr) BMNOEXCEPT
-    { pool_ = pool_ptr; }
+    void set_allocator_pool(allocator_pool_type* pool_ptr) BMNOEXCEPT;
+    allocator_pool_type* get_allocator_pool() const BMNOEXCEPT
+        { return pool_; }

     ///@}

@@ -116,6 +117,9 @@ class basic_bmatrix
     /*! Copy content */
     void copy_from(const basic_bmatrix& bbm);

+    /*! Freeze content into read-only mode drop editing overhead */
+    void freeze();
+
     ///@}

     // ------------------------------------------------------------
@@ -255,7 +259,7 @@ class basic_bmatrix
     /*! Optimize block in all planes
        @internal */
-    void optimize_block(block_idx_type nb);
+    void optimize_block(block_idx_type nb, typename BV::optmode opt_mode);

     /*! Compute memory statistics
        @param st [out] - statistics object
@@ -282,6 +286,12 @@ class basic_bmatrix
     */
     void clear_column(size_type idx, size_type row_from);

+    /*! Swap columns (bits in all rows)
+        @param idx1 - column index 1
+        @param idx2 - column index 2
+    */
+    void swap_columns(size_type idx1, size_type idx2);
+
     /**
         Set SUB (MINUS) operation on all existing rows
        @param bv - argument vector row[i] -= bv
@@ -359,9 +369,9 @@ class base_sparse_vector
     base_sparse_vector();

     base_sparse_vector(bm::null_support        null_able,
-                       allocation_policy_type  ap,
-                       size_type               bv_max_size,
-                       const allocator_type&   alloc);
+                       allocation_policy_type  ap = allocation_policy_type(),
+                       size_type               bv_max_size = bm::id_max,
+                       const allocator_type&   alloc = allocator_type());

     base_sparse_vector(const base_sparse_vector& bsv);

@@ -395,6 +405,10 @@ class base_sparse_vector
     /*! return true if empty */
     bool empty() const BMNOEXCEPT { return size() == 0; }

+    /** swap two vector elements */
+    void swap_elements(size_type idx1, size_type idx2)
+        { bmatr_.swap_columns(idx1, idx2); }
+
 public:

     // ------------------------------------------------------------
@@ -436,7 +450,18 @@ class base_sparse_vector
         is not configured to support assignment flags
     */
     bool is_null(size_type idx) const BMNOEXCEPT;
-
+
+    /**
+        Set allocation pool
+    */
+    void set_allocator_pool(allocator_pool_type* pool_ptr) BMNOEXCEPT
+        { bmatr_.set_allocator_pool(pool_ptr); }
+
+    /**
+        Get allocation pool
+    */
+    allocator_pool_type* get_allocator_pool() const BMNOEXCEPT
+        { return bmatr_.get_allocator_pool(); }

     ///@}

@@ -591,6 +616,11 @@ class base_sparse_vector
     */
     void merge_matr(bmatrix_type& bmatr);

+    /**
+        Turn on RO mode
+    */
+    void freeze_matr() { bmatr_.freeze(); is_ro_ = true; }
+
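// Scalar model of the bit-transposed (bit-sliced) layout basic_bmatrix
// uses: bit r of element idx lives in row r at column idx, so the
// swap_columns() declared above is just a per-row bvector swap of two
// positions, and get_octet() gathers one bit from each of 8 consecutive
// rows. Illustration only; the real code works on raw blocks.
#include <cstdint>
#include "bm.h"

inline uint8_t get_octet_model(bm::bvector<>* const* rows, // 8 row slices
                               bm::bvector<>::size_type idx)
{
    unsigned v = 0;
    for (unsigned r = 0; r < 8; ++r)
        if (rows[r])
            v |= unsigned(rows[r]->test(idx)) << r;
    return (uint8_t)v;
}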
     /*!
        clear column in all value planes
        \param plane_idx - row (plane index to start from)
        \param idx - bit (column) to clear
     */
     void clear_value_planes_from(unsigned plane_idx, size_type idx);

@@ -641,7 +671,11 @@ class base_sparse_vector
     //static constexpr unsigned null_plane() BMNOEXCEPT { return value_bits(); }

     /** optimize block in all matrix planes */
-    void optimize_block(block_idx_type nb) { bmatr_.optimize_block(nb); }
+    void optimize_block(block_idx_type nb, typename BV::optmode opt_mode)
+        { bmatr_.optimize_block(nb, opt_mode); }
+
+    /// Sync read-only state
+    void sync_ro() BMNOEXCEPT;

     /**
         Perform copy_range() on a set of planes
@@ -654,9 +688,10 @@ class base_sparse_vector
 protected:
     bmatrix_type             bmatr_;            ///< bit-transposed matrix
-    unsigned_value_type      slice_mask_;       ///< slice presence bit-mask
-    size_type                size_;             ///< array size
-    unsigned                 effective_slices_;
+    unsigned_value_type      slice_mask_ = 0;   ///< slice presence bit-mask
+    size_type                size_ = 0;         ///< array size
+    unsigned                 effective_slices_=0; ///< number of bit slices actually allocated
+    bool                     is_ro_=false;      ///< read-only
 };

 //---------------------------------------------------------------------
@@ -828,6 +863,16 @@ void basic_bmatrix<BV>::clear_column(size_type idx,

 //---------------------------------------------------------------------

+template<typename BV>
+void basic_bmatrix<BV>::swap_columns(size_type idx1, size_type idx2)
+{
+    for (size_type i = 0; i < rsize_; ++i)
+        if (bvector_type* bv = get_row(i))
+            bv->swap(idx1, idx2);
+}
+
+//---------------------------------------------------------------------
+
 template<typename BV>
 void basic_bmatrix<BV>::allocate_rows(size_type rsize)
 {
@@ -886,6 +931,21 @@ void basic_bmatrix<BV>::free_rows() BMNOEXCEPT
     bv_rows_ = 0;
 }

+//---------------------------------------------------------------------
+
+template<typename BV>
+void basic_bmatrix<BV>::set_allocator_pool(allocator_pool_type* pool_ptr) BMNOEXCEPT
+{
+    if (pool_ != pool_ptr)
+    {
+        for (size_type i = 0; i < rsize_; ++i)
+            if (bvector_type_ptr bv = bv_rows_[i])
+                bv->set_allocator_pool(pool_ptr);
+    }
+    pool_ = pool_ptr;
+}
+
+
 //---------------------------------------------------------------------

 template<typename BV>
@@ -966,7 +1026,7 @@ typename basic_bmatrix<BV>::bvector_type_ptr
 basic_bmatrix<BV>::construct_row(size_type row)
 {
-    if (row > rsize_)
+    if (row >= rsize_)
         allocate_rows(row + 8);
     BM_ASSERT(row < rsize_);
     bvector_type_ptr bv = bv_rows_[row];
@@ -981,7 +1041,7 @@ typename basic_bmatrix<BV>::bvector_type_ptr
 basic_bmatrix<BV>::construct_row(size_type row, const bvector_type& bv_src)
 {
-    if (row > rsize_)
+    if (row >= rsize_)
         allocate_rows(row + 8);
     BM_ASSERT(row < rsize_);
     bvector_type_ptr bv = bv_rows_[row];
@@ -1053,6 +1113,8 @@ basic_bmatrix<BV>::construct_bvector(const bvector_type* bv) const
         rbv->init();
     }
 #endif
+    if (pool_)
+        rbv->set_allocator_pool(pool_);
     return rbv;
 }

@@ -1181,22 +1243,33 @@ void basic_bmatrix<BV>::insert_octet(size_type pos,

 //---------------------------------------------------------------------

+/// @internal
+inline
+bool check_any_fullb(const bm::word_t* blka[8], const bm::word_t* FBADDR)
+{
+    bool b1, b2;
+    b1 = (blka[0] == FBADDR);
+    b2 = (blka[1] == FBADDR);
+    b1 |= (blka[2] == FBADDR);
+    b2 |= (blka[3] == FBADDR);
+    b1 |= (blka[4] == FBADDR);
+    b2 |= (blka[5] == FBADDR);
+    b1 |= (blka[6] == FBADDR);
+    b2 |= (blka[7] == FBADDR);
+    return b1 | b2;
+}
+
 template<typename BV>
 unsigned char
 basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEPT
 {
+    const bm::word_t* blka[8];
     unsigned v = 0;

     block_idx_type nb = (pos >> bm::set_block_shift);
-    unsigned i0 = unsigned(nb >> bm::set_array_shift); // top block address
-    unsigned j0 = unsigned(nb &  bm::set_array_mask);  // address in sub-block
+    unsigned i0, j0;
+    bm::get_block_coord(nb, i0, j0);

-    const bm::word_t* blk;
-    const bm::word_t* blka[8];
-    unsigned nbit = unsigned(pos & bm::set_block_mask);
-    unsigned nword  = unsigned(nbit >> bm::set_word_shift);
-    unsigned mask0 = 1u << (nbit & bm::set_word_mask);
-
     unsigned row_idx = unsigned(octet_idx * 8);
     if (row_idx + 7 >= rsize_ ||
         (null_idx_ && (row_idx + 7 > null_idx_))) // out of bounds request?
@@ -1210,11 +1283,65 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     blka[5] = get_block(row_idx+5, i0, j0);
     blka[6] = get_block(row_idx+6, i0, j0);
     blka[7] = get_block(row_idx+7, i0, j0);
+
+
+    const bm::word_t* const FBADDR = FULL_BLOCK_FAKE_ADDR;

     unsigned is_set;
+    unsigned nbit = unsigned(pos & bm::set_block_mask);
+    const unsigned nword  = unsigned(nbit >> bm::set_word_shift);
+    const unsigned mask0 = 1u << (nbit & bm::set_word_mask);

+#if 0
+    bool any_full = bm::check_any_fullb(blka, FBADDR);
+    if (!any_full)
+    {
+        if (const bm::word_t* blk; (blk = blka[0])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= (unsigned)bool(is_set);
+        }
+        if (const bm::word_t* blk;(blk = blka[1])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 1u;
+        }
+        if (const bm::word_t* blk;(blk = blka[2])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 2u;
+        }
+        if (const bm::word_t* blk;(blk = blka[3])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 3u;
+        }
+        if (const bm::word_t* blk;(blk = blka[4])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 4u;
+        }
+        if (const bm::word_t* blk;(blk = blka[5])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 5u;
+        }
+        if (const bm::word_t* blk;(blk = blka[6])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 6u;
+        }
+        if (const bm::word_t* blk;(blk = blka[7])!=0)
+        {
+            is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
+            v |= unsigned(bool(is_set)) << 7u;
+        }
+        return (unsigned char)v;
+    }
+#endif
+    const bm::word_t* blk;
     if ((blk = blka[0])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1222,7 +1349,7 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[1])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1230,7 +1357,7 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[2])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1238,17 +1365,16 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[3])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
         v |= unsigned(bool(is_set)) << 3u;
     }
-
     if ((blk = blka[4])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1256,7 +1382,7 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[5])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1264,7 +1390,7 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[6])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1272,7 +1398,7 @@ basic_bmatrix<BV>::get_octet(size_type pos, size_type octet_idx) const BMNOEXCEP
     }
     if ((blk = blka[7])!=0)
     {
-        if (blk == FULL_BLOCK_FAKE_ADDR)
+        if (blk == FBADDR)
             is_set = 1;
         else
             is_set = (BM_IS_GAP(blk)) ? bm::gap_test_unr(BMGAP_PTR(blk), nbit) : (blk[nword] & mask0);
@@ -1329,8 +1455,8 @@ basic_bmatrix<BV>::get_half_octet(size_type pos, size_type row_idx) const BMNOEX
     unsigned v = 0;

     block_idx_type nb = (pos >> bm::set_block_shift);
-    unsigned i0 = unsigned(nb >> bm::set_array_shift); // top block address
-    unsigned j0 = unsigned(nb &  bm::set_array_mask);  // address in sub-block
+    unsigned i0, j0;
+    bm::get_block_coord(nb, i0, j0);

     const bm::word_t* blk;
     const bm::word_t* blka[4];
@@ -1413,6 +1539,16 @@ void basic_bmatrix<BV>::optimize(bm::word_t* temp_block,

 //---------------------------------------------------------------------

+template<typename BV>
+void basic_bmatrix<BV>::freeze()
+{
+    for (unsigned k = 0; k < rsize_; ++k)
+        if (bvector_type* bv = get_row(k))
+            bv->freeze();
+}
+
+//---------------------------------------------------------------------
+
 template<typename BV>
 void basic_bmatrix<BV>::calc_stat(typename bvector_type::statistics& st,
                                   size_type rsize) const BMNOEXCEPT
@@ -1431,7 +1567,8 @@ void basic_bmatrix<BV>::calc_stat(typename bvector_type::statistics& st,
 //---------------------------------------------------------------------

 template<typename BV>
-void basic_bmatrix<BV>::optimize_block(block_idx_type nb)
+void basic_bmatrix<BV>::optimize_block(block_idx_type nb,
+                                       typename BV::optmode opt_mode)
 {
     for (unsigned k = 0; k < rsize_; ++k)
     {
@@ -1441,25 +1578,22 @@ void basic_bmatrix<BV>::optimize_block(block_idx_type nb)
             bm::get_block_coord(nb, i, j);
             typename bvector_type::blocks_manager_type& bman =
                                                 bv->get_blocks_manager();
-            bman.optimize_bit_block(i, j);
+            bman.optimize_bit_block(i, j, opt_mode);
         }
     } // for k
 }

 //---------------------------------------------------------------------
+//
 //---------------------------------------------------------------------

 template<class Val, class BV, unsigned MAX_SIZE>
 base_sparse_vector<Val, BV, MAX_SIZE>::base_sparse_vector()
-: bmatr_(sv_slices, allocation_policy_type(), bm::id_max, allocator_type()),
-  slice_mask_(0),
-  size_(0),
-  effective_slices_(0)
-{
-}
+: bmatr_(sv_slices, allocation_policy_type(), bm::id_max, allocator_type())
+{}

 //---------------------------------------------------------------------

 template<class Val, class BV, unsigned MAX_SIZE>
 base_sparse_vector<Val, BV, MAX_SIZE>::base_sparse_vector(
                         bm::null_support        null_able,
                         allocation_policy_type  ap,
                         size_type               bv_max_size,
                         const allocator_type&   alloc)
-: bmatr_(sv_slices, ap, bv_max_size, alloc),
-  slice_mask_(0),
-  size_(0),
-  effective_slices_(0)
+: bmatr_(sv_slices, ap, bv_max_size, alloc)
 {
     if (null_able == bm::use_null)
     {
@@ -1492,8 +1623,7 @@ base_sparse_vector<Val, BV, MAX_SIZE>::base_sparse_vector(
   slice_mask_(bsv.slice_mask_),
   size_(bsv.size_),
   effective_slices_(bsv.effective_slices_)
-{
-}
+{}

 //---------------------------------------------------------------------

@@ -1596,12 +1726,16 @@ void base_sparse_vector<Val, BV, MAX_SIZE>::swap(
 template<class Val, class BV, unsigned MAX_SIZE>
 void base_sparse_vector<Val, BV, MAX_SIZE>::clear_all(bool free_mem) BMNOEXCEPT
 {
-    unsigned slices = value_bits();
+    auto slices = bmatr_.rows();
+    bvector_type* bv_null = this->get_null_bvect();
     for (size_type i = 0; i < slices; ++i)
-        bmatr_.clear_row(i, free_mem);
+        if (bvector_type* bv = this->bmatr_.get_row(i))
+            if (bv != bv_null)
+                bmatr_.clear_row(i, free_mem);
     slice_mask_ = 0; size_ = 0;
-    if (bvector_type* bv_null = get_null_bvect())
+    if (bv_null)
         bv_null->clear(true);
+    is_ro_ = false;
 }

 //---------------------------------------------------------------------

@@ -1614,18 +1748,16 @@ void base_sparse_vector<Val, BV, MAX_SIZE>::clear_range(
 {
     if (right < left)
         return clear_range(right, left, set_null);
-    unsigned planes = value_bits();
+    auto planes = bmatr_.rows();
+    bvector_type* bv_null = this->get_null_bvect();
     for (unsigned i = 0; i < planes; ++i)
     {
         if (bvector_type* bv = this->bmatr_.get_row(i))
-        {
-            BM_ASSERT(bv != this->get_null_bvect());
-            bv->set_range(left, right, false);
-        }
+            if (bv != bv_null)
+                bv->clear_range_no_check(left, right);
     }
-    if (set_null)
-        if (bvector_type* bv_null = this->get_null_bvect())
-            bv_null->set_range(left, right, false);
+    if (set_null && bv_null)
+        bv_null->clear_range_no_check(left, right);
 }

 //---------------------------------------------------------------------

@@ -1862,6 +1994,25 @@ bool base_sparse_vector<Val, BV, MAX_SIZE>::equal(

 //---------------------------------------------------------------------

+template<class Val, class BV, unsigned MAX_SIZE>
+void base_sparse_vector<Val, BV, MAX_SIZE>::sync_ro() BMNOEXCEPT
+{
+    unsigned slices = (unsigned) this->bmatr_.rows();
+    for (unsigned j = 0; j < slices; ++j)
+    {
+        if (const bvector_type* bv = this->bmatr_.get_row(j))
+        {
+            if (bv->is_ro())
+            {
+                is_ro_ = true;
+                break;
+            }
+        }
+    } // for j
+}
+
+//---------------------------------------------------------------------
+
 template<class Val, class BV, unsigned MAX_SIZE>
 void base_sparse_vector<Val, BV, MAX_SIZE>::copy_range_slices(
         const base_sparse_vector<Val, BV, MAX_SIZE>& bsv,
diff --git a/tools/tax/src/bm/bmbuffer.h b/tools/tax/src/bm/bmbuffer.h
index 6a39639b..42684926 100644
--- a/tools/tax/src/bm/bmbuffer.h
+++ b/tools/tax/src/bm/bmbuffer.h
@@ -197,24 +197,25 @@ class byte_buffer : public byte_buffer_ptr
     size_t capacity() const BMNOEXCEPT { return capacity_; }

     /// adjust current size (buffer content preserved)
-    void resize(size_t new_size, bool copy_content = true)
+    unsigned char* resize(size_t new_size, bool copy_content = true)
     {
         if (new_size <= capacity_)
         {
             this->size_ = new_size;
-            return;
+            return data();
         }
         byte_buffer tmp_buffer(new_size); // temp with new capacity
         if (copy_content)
             tmp_buffer = *this;
         this->swap(tmp_buffer);
-        this->size_ = new_size;
+        return data();
     }

     /// adjust current size (no need to reallocate)
     void resize_no_check(size_t new_size) BMNOEXCEPT
     {
+        BM_ASSERT(new_size < capacity_ || !new_size);
         this->size_ = new_size;
     }

@@ -267,7 +268,20 @@ class byte_buffer : public byte_buffer_ptr
         return sizeof(capacity_) + sizeof(alloc_factor_) + capacity();
     }
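// Sketch of the two-phase fill pattern these buffer changes enable (used by
// the sort_input_blocks_*() routines earlier in this patch): size the vector
// once without copying, write through the raw pointer, then trim to the
// count actually written. HVec stands for any bm::heap_vector-like type with
// resize_no_copy()/resize_no_check() as changed here.
template<class HVec>
void two_phase_fill(HVec& vec, const bm::word_t* const* src, size_t n)
{
    auto arr = vec.resize_no_copy(n); // reserve, skip content copy
    size_t cnt = 0;
    for (size_t k = 0; k < n; ++k)
        if (src[k])                   // keep only non-NULL blocks
            arr[cnt++] = src[k];
    vec.resize_no_check(cnt);         // shrink: stays within capacity
}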
allocator_type::deallocate((bm::word_t*)byte_buf_, alloc_factor_); + this->byte_buf_ = 0; + } + } + private: /// Override from the base class void set_buf(unsigned char* buf, size_t size); @@ -295,15 +309,7 @@ class byte_buffer : public byte_buffer_ptr capacity_ = alloc_factor_ * sizeof(bm::word_t); } - void free_buffer() - { - if (byte_buf_) - { - allocator_type::deallocate((bm::word_t*)byte_buf_, alloc_factor_); - this->byte_buf_ = 0; - } - } - + private: size_t capacity_; ///< current capacity size_t alloc_factor_; ///< number of blocks allocated for buffer @@ -491,12 +497,23 @@ class heap_vector @brief resize without content preservation @internal */ - void resize_no_copy(size_type new_size) + value_type* resize_no_copy(size_type new_size) { size_type v_size = value_size(); - buffer_.resize(new_size * v_size); + return (value_type*) buffer_.resize(new_size * v_size, false /*no copy content*/); + } + + /** + @brief resize without content preservation or capacity verification + @internal + */ + void resize_no_check(size_type new_size) + { + size_type v_size = value_size(); + buffer_.resize_no_check(new_size * v_size); } + /** @brief Add element to the end of the vector, return reference @return reference the the last element @@ -727,6 +744,14 @@ class dynamic_heap_matrix set_zero(); } + /** + Free memory + */ + void free() BMNOEXCEPT + { + buffer_.free_buffer(); + } + size_type rows() const BMNOEXCEPT { return rows_; } size_type cols() const BMNOEXCEPT { return cols_; } diff --git a/tools/tax/src/bm/bmbvimport.h b/tools/tax/src/bm/bmbvimport.h index 57f3990c..0c91199d 100644 --- a/tools/tax/src/bm/bmbvimport.h +++ b/tools/tax/src/bm/bmbvimport.h @@ -76,7 +76,11 @@ void bit_import_u32(BV& bv, &bit_arr[nb*bm::set_block_size]; bm::bit_block_copy_unalign(block, bit_arr_block_ptr); if (optimize) - bman.optimize_bit_block(nb); // returns tem_block if needed + { + unsigned i0, j0; + bm::get_block_coord(nb, i0, j0); + bman.optimize_bit_block(i0, j0, BV::opt_compress); // returns tem_block if needed + } } // for nb // tail processing @@ -94,7 +98,11 @@ void bit_import_u32(BV& bv, while (k < bm::set_block_size) // zero the block's tail block[k++] = 0; if (optimize) - bman.optimize_bit_block(nb); // returns tem_block if needed + { + unsigned i0, j0; + bm::get_block_coord(nb, i0, j0); + bman.optimize_bit_block(i0, j0, BV::opt_compress); // returns tem_block if needed + } } if (optimize) bman.free_temp_block(); diff --git a/tools/tax/src/bm/bmconst.h b/tools/tax/src/bm/bmconst.h index df7a12f6..b0459e2d 100644 --- a/tools/tax/src/bm/bmconst.h +++ b/tools/tax/src/bm/bmconst.h @@ -123,8 +123,9 @@ const unsigned rs3_border0_1 = rs3_border0 + rs3_half_span; // intermed pnt 1 const unsigned rs3_border1_1 = rs3_border1 + rs3_half_span; // intermed pnt 2 // misc parameters for sparse vec algorithms -const unsigned sub_block3_size = bm::gap_max_bits / 4; - +//const unsigned sub_bfind_block_cnt = 32; // bfind discretization factor +//const unsigned sub_block_l1_size = +// bm::gap_max_bits / bm::sub_bfind_block_cnt; // size in bits/elements #if defined(BM64OPT) || defined(BM64_SSE4) typedef id64_t wordop_t; @@ -147,6 +148,17 @@ enum strategy BM_GAP = 1 //!< GAP compression is ON. }; +/*! 
+ @brief copy strategy + @ingroup bvector +*/ +enum class finalization +{ + UNDEFINED = 0, + READONLY = 1, //!< immutable (read-only object) + READWRITE = 2, //!< mutable (read-write object) +}; + /** Codes of set operations @@ -230,10 +242,16 @@ template struct _copyright static const unsigned _v[3]; ///< MAJOR.MINOR.PATCH version components }; +#define BM_VERSION_MAJOR 7 +#define BM_VERSION_MINOR 13 +#define BM_VERSION_PATCH 3 + template const char _copyright::_p[] = - "BitMagic C++ Library. v.7.10.1 (c) 2002-2022 Anatoliy Kuznetsov."; -template const unsigned _copyright::_v[3] = {7, 10, 1}; + "BitMagic Library. v.7.13.3 (c) 2002-2022 Anatoliy Kuznetsov."; +template const unsigned _copyright::_v[3] = + { BM_VERSION_MAJOR, BM_VERSION_MINOR, BM_VERSION_PATCH }; +#define BM_SCALAR_VERSION (((BM_VERSION_MAJOR) << 16) + ((BM_VERSION_MINOR) << 8) + (BM_VERSION_PATCH)) /** diff --git a/tools/tax/src/bm/bmdbg.h b/tools/tax/src/bm/bmdbg.h index 4a6a1d3d..a1ecde13 100644 --- a/tools/tax/src/bm/bmdbg.h +++ b/tools/tax/src/bm/bmdbg.h @@ -409,7 +409,7 @@ void print_stat(TOut& tout, const BV& bv, typename BV::block_idx_type blocks = 0 { const typename BV::blocks_manager_type& bman = bv.get_blocks_manager(); - bm::id_t count = 0; + bm::id_t count = 0; (void)count; int printed = 0; int total_gap_eff = 0; @@ -664,9 +664,12 @@ void print_svector_stat(TOut& tout, const SV& svect, bool print_sim = false) bm::build_jaccard_similarity_batch(sbatch, svect); - sbatch.calculate(); - sbatch.sort(); - + if (print_sim) + { + sbatch.calculate(); + sbatch.sort(); + } + typename similarity_batch_type::vector_type& sim_vec = sbatch.descr_vect_; if (print_sim) { @@ -1084,6 +1087,27 @@ void convert_bv2sv(SV& sv, const BV& bv) bit.flush(); } +#if 0 +/** + Get RSS on + @internal + */ +size_t getCurrentRSS( ) +{ + long rss = 0L; + FILE* fp = NULL; + if ( (fp = fopen( "/proc/self/statm", "r" )) == NULL ) + return (size_t)0L; /* Can't open? */ + if ( fscanf( fp, "%*s%ld", &rss ) != 1 ) + { + fclose( fp ); + return (size_t)0L; /* Can't read? 
*/
+    }
+    fclose( fp );
+    return (size_t)rss * (size_t)sysconf( _SC_PAGESIZE);
+}
+#endif
+
 
 } // namespace
 
diff --git a/tools/tax/src/bm/bmfunc.h b/tools/tax/src/bm/bmfunc.h
index 6dbfbc7e..7993bdd1 100644
--- a/tools/tax/src/bm/bmfunc.h
+++ b/tools/tax/src/bm/bmfunc.h
@@ -64,6 +64,8 @@ struct bv_statistics
     gap_word_t  gap_levels[bm::gap_levels]; ///< GAP block lengths in the bvect
     unsigned long long gaps_by_level[bm::gap_levels]; ///< number of GAP blocks at each level
 
+    bv_statistics() BMNOEXCEPT { reset(); }
+
     /// count bit block
     void add_bit_block() BMNOEXCEPT
     {
@@ -74,23 +76,18 @@ struct bv_statistics
     }
 
     /// count gap block
-    void add_gap_block(unsigned capacity, unsigned length) BMNOEXCEPT
+    void add_gap_block(unsigned capacity, unsigned length, unsigned level) BMNOEXCEPT
     {
+        BM_ASSERT(level < bm::gap_levels);
+
         ++gap_blocks;
         size_t mem_used = (capacity * sizeof(gap_word_t));
         memory_used += mem_used;
         max_serialize_mem += (unsigned)(length * sizeof(gap_word_t));
         BM_ASSERT(length <= capacity);
         gap_cap_overhead += (capacity - length) * sizeof(gap_word_t);
-        for (unsigned i = 0; i < bm::gap_levels; ++i)
-        {
-            if (capacity == gap_levels[i])
-            {
-                gaps_by_level[i]++;
-                return;
-            }
-        }
-        BM_ASSERT(0);
+        if (level < bm::gap_levels)
+            gaps_by_level[level]++;
     }
 
     /// Reset statistics
@@ -99,7 +96,7 @@ struct bv_statistics
         bit_blocks = gap_blocks = ptr_sub_blocks = bv_count = 0;
         max_serialize_mem = memory_used = gap_cap_overhead = 0;
         for (unsigned i = 0; i < bm::gap_levels; ++i)
-            gaps_by_level[i] = 0ull;
+            gaps_by_level[i] = 0;
     }
 
     /// Sum data from another structure
@@ -125,11 +122,27 @@ struct bv_arena_statistics
     size_t bit_blocks_sz;      ///< Total size of bit blocks
     size_t gap_blocks_sz;      ///< Total size of gap blocks
     size_t ptr_sub_blocks_sz;  ///< Total size of sub-blocks ptrs
+    unsigned top_block_size;   ///< size of top descriptor
 
     /// Reset statistics
     void reset() BMNOEXCEPT
     {
-        bit_blocks_sz = gap_blocks_sz = ptr_sub_blocks_sz = 0;
+        bit_blocks_sz = gap_blocks_sz = ptr_sub_blocks_sz = top_block_size = 0;
+    }
+
+    /// Get allocation size in bytes
+    size_t get_alloc_size() const BMNOEXCEPT
+    {
+        size_t sz = bit_blocks_sz * sizeof(bm::word_t);
+
+        if (gap_blocks_sz) // add padding space for SIMD vect overread
+        {
+            sz += (gap_blocks_sz + bm::gap_len_table_min::_len[0])
+                                        * sizeof(bm::gap_word_t);
+        }
+
+        sz += (ptr_sub_blocks_sz + top_block_size) * sizeof(void*);
+        return sz;
     }
 };
 
@@ -686,7 +699,7 @@ unsigned short bitscan_bsf(unsigned w, B* bits) BMNOEXCEPT
     unsigned short pos = 0;
     while (w)
     {
-        bits[pos++] = count_trailing_zeros_u32(w);
+        bits[pos++] = (B)bm::count_trailing_zeros_u32(w);
         w &= w - 1;
     }
     return pos;
@@ -719,7 +732,7 @@ unsigned short bitscan_bsf64(bm::id64_t w, B* bits) BMNOEXCEPT
     unsigned short pos = 0;
     while (w)
     {
-        bits[pos++] = bm::count_trailing_zeros_u64(w);
+        bits[pos++] = (B)bm::count_trailing_zeros_u64(w);
         w &= w - 1;
     }
     return pos;
@@ -1016,8 +1029,8 @@ bm::id64_t digest_mask(unsigned from, unsigned to) BMNOEXCEPT
     bm::id64_t digest_from = from >> bm::set_block_digest_pos_shift;
     bm::id64_t digest_to = to >> bm::set_block_digest_pos_shift;
 
-    bm::id64_t mask(~0ull);
-    mask = (mask >> (63 - (digest_to - digest_from))) << digest_from;
+    bm::id64_t mask =
+        ((~0ull) >> (63 - (digest_to - digest_from))) << digest_from;
 
     //BM_ASSERT(mask == bm::dm_control(from, to));
 
@@ -1037,7 +1050,7 @@ bm::id64_t digest_mask(unsigned from, unsigned to) BMNOEXCEPT
    @ingroup bitfunc
    @internal
 */
-inline
+BMFORCEINLINE
 bool check_zero_digest(bm::id64_t digest,
                        unsigned bitpos_from, unsigned bitpos_to)
BMNOEXCEPT { @@ -1045,6 +1058,28 @@ bool check_zero_digest(bm::id64_t digest, return !(digest & mask); } +/** + \brief Is one range of 1s ( 0000110000 - one range, 000011000010 - more than one) + @return true + @internal + */ +inline +bool is_digest_one_range(bm::id64_t digest) BMNOEXCEPT +{ + BM_ASSERT(digest); + bm::id64_t mask = 1; + bool prev = digest & mask; + unsigned cnt = prev; + for (mask <<= 1; mask; mask <<= 1) + { + bool curr = digest & mask; + if (curr && curr != prev) + cnt++; + prev = curr; + } // for + return cnt == 1; +} + /*! \brief Init block with 000111000 pattren based on digest \param block - Bit block [out] @@ -1056,22 +1091,20 @@ bool check_zero_digest(bm::id64_t digest, inline void block_init_digest0(bm::word_t* const block, bm::id64_t digest) BMNOEXCEPT { - unsigned off; for (unsigned i = 0; i < 64; ++i) { - off = i * bm::set_block_digest_wave_size; - bm::word_t mask = (digest & 1) ? ~0u : 0u; + unsigned off = i * bm::set_block_digest_wave_size; + bm::word_t mask = 0u - unsigned(digest & 1u); // (digest & 1) ? ~0u : 0u; + BM_ASSERT(mask == ((digest & 1) ? ~0u : 0u)); #if defined(VECT_BLOCK_SET_DIGEST) VECT_BLOCK_SET_DIGEST(&block[off], mask); #else - for (unsigned j = 0; j < bm::set_block_digest_wave_size; j+=4) - { - block[off+j+0] = block[off+j+1] = - block[off+j+2] = block[off+j+3] = mask; - } // for j + for (; off < (i * bm::set_block_digest_wave_size) + + bm::set_block_digest_wave_size; off+=4) + block[off] = block[off+1] = block[off+2] = block[off+3] = mask; #endif digest >>= 1ull; - } // for + } // for i } /*! @@ -1137,6 +1170,7 @@ update_block_digest0(const bm::word_t* const block, bm::id64_t digest) BMNOEXCEP unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; + d = bm::bmi_bslr_u64(d); // d &= d - 1; #if defined(VECT_IS_DIGEST_ZERO) bool all_zero = VECT_IS_DIGEST_ZERO(&block[off]); @@ -1155,10 +1189,9 @@ update_block_digest0(const bm::word_t* const block, bm::id64_t digest) BMNOEXCEP digest &= w64 ? digest : ~(mask << wave); #endif - d = bm::bmi_bslr_u64(d); // d &= d - 1; } // while - BM_ASSERT(bm::calc_block_digest0(block) == digest); +// BM_ASSERT(bm::calc_block_digest0(block) == digest); return digest; } @@ -1325,7 +1358,7 @@ template struct all_set { bm::word_t BM_VECT_ALIGN* _s[bm::set_sub_array_size] BM_VECT_ALIGN_ATTR; bm::word_t BM_VECT_ALIGN _p[bm::set_block_size] BM_VECT_ALIGN_ATTR; - bm::word_t* _p_fullp; + bm::word_t* _p_fullp; all_set_block() BMNOEXCEPT { @@ -1391,7 +1424,8 @@ template typename all_set::all_set_block all_set::_block; @internal */ template -bool find_not_null_ptr(bm::word_t*** arr, N start, N size, N* pos) BMNOEXCEPT +bool find_not_null_ptr(const bm::word_t* const * const* arr, + N start, N size, N* pos) BMNOEXCEPT { BM_ASSERT(pos); // BM_ASSERT(start < size); @@ -1687,43 +1721,64 @@ gap_find_first(const T* BMRESTRICT buf, unsigned* BMRESTRICT first) BMNOEXCEPT \return GAP index. 
@ingroup gapfunc */ -template +template unsigned gap_bfind(const T* BMRESTRICT buf, unsigned pos, unsigned* BMRESTRICT is_set) BMNOEXCEPT { BM_ASSERT(pos < bm::gap_max_bits); - #undef VECT_GAP_BFIND // TODO: VECTOR bfind causes performance degradation - #ifdef VECT_GAP_BFIND + //#undef VECT_GAP_BFIND // TODO: VECTOR bfind causes performance degradation + #if defined(VECT_GAP_BFIND) //&& defined(BMAVX2OPT) return VECT_GAP_BFIND(buf, pos, is_set); #else - *is_set = (*buf) & 1; - unsigned start = 1; - unsigned end = 1 + ((*buf) >> 3); - while (start != end) - { - if ((end - start) < 16) // use direct scan on short span - { - do - { - if (buf[start] >= pos) - goto break2; - } while (++start); - BM_ASSERT(0); // should not get here - break; - } - unsigned curr = (start + end) >> 1; - if ( buf[curr] < pos ) - start = curr + 1; - else - end = curr; - } // while - break2: - *is_set ^= ((start-1) & 1); - return start; + unsigned start = 1; + unsigned end = ((*buf) >> 3); + + unsigned size = end - start; + for (; size >= 64; size = end - start) + { + unsigned mid = (start + end) >> 1; + if (buf[mid] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + } // for + + for (; size >= 16; size = end - start) + { + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; + else + end = mid; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; + else + end = mid; + } // for + + for(; true; ++start) + if (buf[start] >= pos) + break; + + *is_set = ((*buf) & 1) ^ ((start-1) & 1); + return start; #endif } + + /*! \brief Tests if bit = pos is true. \param buf - GAP buffer pointer. @@ -1750,19 +1805,19 @@ unsigned gap_test(const T* BMRESTRICT buf, unsigned pos) BMNOEXCEPT if (buf[6] >= pos) return sv1; if (buf[7] >= pos) return sv; if (buf[8] >= pos) return sv1; - if (buf[9] >= pos) return sv; - BM_ASSERT(0); + BM_ASSERT(buf[9] >= pos); + return sv; } else { - while (start != end) + BM_ASSERT(start != end); + do { - unsigned curr = (start + end) >> 1; - if (buf[curr] < pos) - start = curr + 1; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; else - end = curr; - } + end = mid; + } while (start != end); } return ((*buf) & 1) ^ ((--start) & 1); } @@ -1777,20 +1832,11 @@ unsigned gap_test(const T* BMRESTRICT buf, unsigned pos) BMNOEXCEPT template unsigned gap_test_unr(const T* BMRESTRICT buf, const unsigned pos) BMNOEXCEPT { + BM_ASSERT(buf); BM_ASSERT(pos < bm::gap_max_bits); - if (pos == 0) // quick answer possible - { - return (*buf) & 1; - } -#if defined(BMSSE2OPT) - unsigned res = bm::sse2_gap_test(buf, pos); - BM_ASSERT(res == bm::gap_test(buf, pos)); -#elif defined(BMSSE42OPT) - unsigned res = bm::sse42_gap_test(buf, pos); - BM_ASSERT(res == bm::gap_test(buf, pos)); -#elif defined(BMAVX2OPT) - unsigned res = bm::avx2_gap_test(buf, pos); +#if defined(VECT_GAP_TEST) + unsigned res = VECT_GAP_TEST(buf, pos); BM_ASSERT(res == bm::gap_test(buf, pos)); #else unsigned res = bm::gap_test(buf, pos); @@ -2646,53 +2692,6 @@ SIZE_TYPE gap_find_rank(const T* const block, } - -/*! - \brief Counts 1 bits in GAP buffer in the closed [0, right] range. - \param buf - GAP buffer pointer. 
- \param right- rightmost bit index - \param is_corrected - if true the result will be rank corrected - if right bit == true count=count-1 - \return Number of non-zero bits - @ingroup gapfunc -*/ -/* -template -unsigned gap_bit_count_to(const T* const buf, T right, - bool is_corrected=false) BMNOEXCEPT -{ - const T* pcurr = buf; - const T* pend = pcurr + (*pcurr >> 3); - - unsigned bits_counter = 0; - unsigned is_set = ~((unsigned(*buf) & 1u) - 1u); // 0xFFF.. if true (mask for branchless code) - BM_ASSERT(is_set == 0u || is_set == ~0u); - pcurr = buf + 1; - - if (right <= *pcurr) // we are in the target block right now - { - bits_counter = (right + 1u) & is_set; // & is_set == if (is_set) - bits_counter -= (is_set & unsigned(is_corrected)); - return bits_counter; - } - bits_counter += (*pcurr + 1u) & is_set; - - unsigned prev_gap = *pcurr++; - for (is_set ^= ~0u; right > *pcurr; is_set ^= ~0u) - { - bits_counter += (*pcurr - prev_gap) & is_set; - if (pcurr == pend) - { - bits_counter -= (is_set & unsigned(is_corrected)); - return bits_counter; - } - prev_gap = *pcurr++; - } - bits_counter += (right - prev_gap) & is_set; - bits_counter -= (is_set & unsigned(is_corrected)); - return bits_counter; -} -*/ template unsigned gap_bit_count_to(const T* const buf, T right) BMNOEXCEPT { @@ -3158,20 +3157,20 @@ unsigned gap_buff_count_op(const T* vect1, const T* vect2) BMNOEXCEPT2 \param buf - GAP buffer. \param pos - Index of bit to set. \param is_set - (OUT) flag if bit was actually set. + \param curr - (pos) position index - \return New GAP buffer length. + \return New GAP buffer length. @ingroup gapfunc + @internal */ template -unsigned gap_set_value(unsigned val, +unsigned gap_set_value_cpos(unsigned val, T* BMRESTRICT buf, unsigned pos, - unsigned* BMRESTRICT is_set) BMNOEXCEPT + unsigned* BMRESTRICT is_set, + unsigned curr) BMNOEXCEPT { - BM_ASSERT(pos < bm::gap_max_bits); - - unsigned curr = bm::gap_bfind(buf, pos, is_set); T end = (T)(*buf >> 3); if (*is_set == val) { @@ -3215,7 +3214,7 @@ unsigned gap_set_value(unsigned val, --end; do { *pprev++ = *pcurr++; } while (pcurr < pend); } - } + } } else if (*pcurr == pos) // Rightmost bit in the GAP. Border goes left. @@ -3239,6 +3238,31 @@ unsigned gap_set_value(unsigned val, } +/*! + \brief Sets or clears bit in the GAP buffer. + + \param val - new bit value + \param buf - GAP buffer. + \param pos - Index of bit to set. + \param is_set - (OUT) flag if bit was actually set. + + \return New GAP buffer length. 
+
+    @ingroup gapfunc
+*/
+template
+unsigned gap_set_value(unsigned val,
+                       T* BMRESTRICT buf,
+                       unsigned pos,
+                       unsigned* BMRESTRICT is_set) BMNOEXCEPT
+{
+    BM_ASSERT(pos < bm::gap_max_bits);
+
+    unsigned curr = bm::gap_bfind(buf, pos, is_set);
+    return gap_set_value_cpos(val, buf, pos, is_set, curr);
+}
+
+
 
 
 
@@ -3408,17 +3432,18 @@ bool gap_shift_r1(T* BMRESTRICT buf,
                   unsigned co_flag, unsigned* BMRESTRICT new_len) BMNOEXCEPT
 {
     BM_ASSERT(new_len);
-    bool co;
+    BM_ASSERT(co_flag <= 1);
+
+    bool co, gap_set_flag;
+    unsigned len = (*buf >> 3);
     // 1: increment all GAP values by 1
     {
         unsigned bitval = *buf & 1;
+        gap_set_flag = (bitval != co_flag);
         if (buf[1] == bm::gap_max_bits-1) // full GAP block
-        {
             co = bitval;
-        }
         else
         {
-            unsigned len = (*buf >> 3);
             unsigned i = 1;
             for (; i < len; ++i)
             {
@@ -3435,11 +3460,70 @@ bool gap_shift_r1(T* BMRESTRICT buf,
             co = bitval;
         }
     }
-    // set bit position 0 with carry-in flag
-    {
-        unsigned is_set;
+    // set bit 0 with carry-in flag
+    unsigned is_set;
+    if (gap_set_flag)
         *new_len = bm::gap_set_value(co_flag, buf, 0, &is_set);
+    else
+        *new_len = len;
+
+    return co;
+}
+
+/*!
+    @brief insert bit into GAP compressed block
+    @param buf - block pointer
+    @param pos - insert position
+    @param val - value to set (0 or 1)
+    @param new_len - output length of the GAP block after the operation
+
+    @return carry over bit (1 or 0)
+    @ingroup gapfunc
+*/
+template
+bool gap_insert(T* BMRESTRICT buf,
+                unsigned pos, unsigned val, unsigned* BMRESTRICT new_len) BMNOEXCEPT
+{
+    BM_ASSERT(new_len);
+    BM_ASSERT(val <= 1);
+
+    bool co, gap_set_flag;
+    unsigned is_set;
+    unsigned idx = bm::gap_bfind(buf, pos, &is_set);
+    BM_ASSERT(is_set <= 1);
+
+    gap_set_flag = (val != is_set);
+    unsigned len = (*buf >> 3);
+
+    // 1: increment all GAP values by 1
+    if (buf[idx] == bm::gap_max_bits-1)
+    {
+        co = is_set;
+    }
+    else
+    {
+        unsigned i = idx;
+        for (; i < len; ++i)
+        {
+            buf[i]++;
+            is_set ^= 1;
+        } // for i
+        BM_ASSERT(buf[i] == bm::gap_max_bits-1);
+        if (buf[i-1] == bm::gap_max_bits-1) // last element shifts out
+        {
+            // Set correct length word
+            --len;
+            *buf = (T)((*buf & 7) + (len << 3));
+            *new_len = len;
+        }
+        co = is_set;
     }
+
+    if (gap_set_flag)
+        *new_len = bm::gap_set_value(val, buf, pos, &is_set);
+    else
+        *new_len = len;
+
     return co;
 }
 
@@ -3457,6 +3541,8 @@ bool gap_shift_l1(T* BMRESTRICT buf,
                   unsigned co_flag, unsigned* BMRESTRICT new_len) BMNOEXCEPT
 {
     BM_ASSERT(new_len);
+    BM_ASSERT(co_flag <= 1);
+
     unsigned is_set;
 
     // 1: decrement all GAP values by 1
@@ -3679,8 +3765,6 @@ unsigned test_bit(const unsigned* block, unsigned bitpos) BMNOEXCEPT
 inline
 void or_bit_block(unsigned* dest, unsigned bitpos, unsigned bitcount) BMNOEXCEPT
 {
-    const unsigned maskFF = ~0u;
-
     dest += unsigned(bitpos >> bm::set_word_shift); // nword
     bitpos &= bm::set_word_mask;
 
@@ -3690,11 +3774,11 @@ void or_bit_block(unsigned* dest, unsigned bitpos, unsigned bitcount) BMNOEXCEPT
         return;
     }
 
-    if (bitpos) // starting pos is not aligned
+    const unsigned maskFF = ~0u;
+    if (bitpos) // starting pos is not aligned
     {
         unsigned mask_r = maskFF << bitpos;
-        unsigned right_margin = bitpos + bitcount;
-        if (right_margin < 32)
+        if (unsigned right_margin = bitpos + bitcount; right_margin < 32)
         {
             *dest |= (maskFF >> (32 - right_margin)) & mask_r;
             return;
@@ -3726,22 +3810,20 @@ void or_bit_block(unsigned* dest, unsigned bitpos, unsigned bitcount) BMNOEXCEPT
 inline
 void sub_bit_block(unsigned* dest, unsigned bitpos, unsigned bitcount) BMNOEXCEPT
 {
-    const unsigned maskFF = ~0u;
-
+    BM_ASSERT(bitcount);
+
dest += unsigned(bitpos >> bm::set_word_shift); // nword bitpos &= bm::set_word_mask; - if (bitcount == 1u) // special case (only 1 bit to set) { - *dest &= ~(1u << bitpos); + *dest &= ~(bitcount << bitpos); return; } - + const unsigned maskFF = ~0u; if (bitpos) // starting pos is not aligned { unsigned mask_r = maskFF << bitpos; - unsigned right_margin = bitpos + bitcount; - if (right_margin < 32) + if (unsigned right_margin = bitpos + bitcount; right_margin < 32) { *dest &= ~((maskFF >> (32 - right_margin)) & mask_r); return; @@ -3756,9 +3838,7 @@ void sub_bit_block(unsigned* dest, unsigned bitpos, unsigned bitcount) BMNOEXCEP *dest++ = 0u; bitcount -= 32; } if (bitcount) - { *dest &= ~(maskFF >> (32 - bitcount)); - } } @@ -3855,15 +3935,19 @@ void gap_sub_to_bitset(unsigned* BMRESTRICT dest, \param pcurr - GAP buffer pointer. \param digest0 - digest of 0 strides inside bit block + @return new digest + @ingroup gapfunc */ template -void gap_sub_to_bitset(unsigned* BMRESTRICT dest, +bm::id64_t gap_sub_to_bitset(unsigned* BMRESTRICT dest, const T* BMRESTRICT pcurr, bm::id64_t digest0) BMNOEXCEPT { BM_ASSERT(dest && pcurr); - const T* pend = pcurr + (*pcurr >> 3); + const T* BMRESTRICT pbuf = pcurr; + const unsigned len = (*pcurr >> 3); + const T* BMRESTRICT pend = pcurr + len; if (*pcurr & 1) // Starts with 1 { bool all_zero = bm::check_zero_digest(digest0, 0, pcurr[1]); @@ -3878,33 +3962,40 @@ void gap_sub_to_bitset(unsigned* BMRESTRICT dest, { unsigned tz = bm::count_trailing_zeros_u64(digest0); unsigned start_pos = tz << set_block_digest_pos_shift; - for (; pcurr <= pend; pcurr += 2) // now we are in GAP "0" + if (len > 16) { - if (*pcurr >= start_pos) - break; + unsigned is_set; + unsigned found_pos = bm::gap_bfind(pbuf, start_pos, &is_set); + if (found_pos > 2) + { + found_pos += !is_set; // to GAP "1" (can go out of scope) + pcurr = pbuf + found_pos; + } + BM_ASSERT (pcurr > pend || *pcurr >= start_pos); + } + else + { + for (; pcurr <= pend; pcurr += 2) // now we are in GAP "1" + if (*pcurr >= start_pos) + break; } } - unsigned lz = bm::count_leading_zeros_u64(digest0); + const unsigned lz = bm::count_leading_zeros_u64(digest0); unsigned stop_pos = (64u - lz) << set_block_digest_pos_shift; - unsigned bc, pos; - T prev; - for (; pcurr <= pend; pcurr += 2) // now we are in GAP "1" again + for (T prev; pcurr <= pend; pcurr += 2) // now we are in GAP "1" again { BM_ASSERT(*pcurr > *(pcurr-1)); prev = pcurr[-1]; - bc = *pcurr - prev; - pos = 1u + prev; - + unsigned pos = 1u + prev; bool all_zero = bm::check_zero_digest(digest0, prev, *pcurr); if (!all_zero) - bm::sub_bit_block(dest, pos, bc); - + bm::sub_bit_block(dest, pos, *pcurr - prev); if (pos > stop_pos) break; // early break is possible based on digest tail - } // for + return bm::update_block_digest0(dest, digest0); } @@ -4026,19 +4117,22 @@ void gap_and_to_bitset(unsigned* BMRESTRICT dest, \brief ANDs GAP block to bitblock with digest assist \param dest - bitblock buffer pointer. \param pcurr - GAP buffer pointer. 
- \param digest0 - digest of 0 strides for the destination + \param digest0 - digest of 0 strides for the destination + + @return new digest @ingroup gapfunc */ template -void gap_and_to_bitset(unsigned* BMRESTRICT dest, +bm::id64_t gap_and_to_bitset(unsigned* BMRESTRICT dest, const T* BMRESTRICT pcurr, bm::id64_t digest0) BMNOEXCEPT { BM_ASSERT(dest && pcurr); if (!digest0) - return; - - const T* pend = pcurr + (*pcurr >> 3); + return digest0; + const T* BMRESTRICT pbuf = pcurr; + const unsigned len = (*pcurr >> 3); + const T* BMRESTRICT pend = pcurr + len; if (!(*pcurr & 1) ) // Starts with 0 { bool all_zero = bm::check_zero_digest(digest0, 0, pcurr[1]); @@ -4053,34 +4147,42 @@ void gap_and_to_bitset(unsigned* BMRESTRICT dest, { unsigned tz = bm::count_trailing_zeros_u64(digest0); unsigned start_pos = tz << set_block_digest_pos_shift; - for (; pcurr <= pend; pcurr += 2) // now we are in GAP "0" + if (len > 16) { - if (*pcurr >= start_pos) - break; + unsigned is_set; + unsigned found_pos = bm::gap_bfind(pbuf, start_pos, &is_set); + if (found_pos > 2) + { + found_pos += is_set; // to GAP "0" (can go out of scope) + pcurr = pbuf + found_pos; + } + BM_ASSERT (pcurr > pend || *pcurr >= start_pos); } + else + { + for (; pcurr <= pend; pcurr += 2) // now we are in GAP "0" + if (*pcurr >= start_pos) + break; + } + } - unsigned lz = bm::count_leading_zeros_u64(digest0); - unsigned stop_pos = (64u - lz) << set_block_digest_pos_shift; - - unsigned bc, pos; - T prev; - for (; pcurr <= pend; pcurr += 2) // now we are in GAP "0" again + const unsigned lz = bm::count_leading_zeros_u64(digest0); + const unsigned stop_pos = (64u - lz) << set_block_digest_pos_shift; + + for (T prev; pcurr <= pend; pcurr += 2) // now we are in GAP "0" again { BM_ASSERT(*pcurr > *(pcurr-1)); - prev = pcurr[-1]; - bc = *pcurr - prev; - pos = 1u + prev; - + unsigned pos = 1u + prev; bool all_zero = bm::check_zero_digest(digest0, prev, *pcurr); if (!all_zero) - bm::sub_bit_block(dest, pos, bc); - + bm::sub_bit_block(dest, pos, *pcurr - prev); if (pos > stop_pos) // early break is possible based on digest tail break; + } // for pcurr - } // for + return bm::update_block_digest0(dest, digest0); } @@ -6806,7 +6908,8 @@ bm::id64_t bit_block_and(bm::word_t* BMRESTRICT dst, unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; - + d = bm::bmi_bslr_u64(d); // d &= d - 1; + #if defined(VECT_AND_DIGEST) bool all_zero = VECT_AND_DIGEST(&dst[off], &src[off]); if (all_zero) @@ -6829,8 +6932,6 @@ bm::id64_t bit_block_and(bm::word_t* BMRESTRICT dst, if (!acc) // all zero digest &= ~(mask << wave); #endif - - d = bm::bmi_bslr_u64(d); // d &= d - 1; } // while return digest; @@ -6860,9 +6961,9 @@ bm::id64_t bit_block_and_5way(bm::word_t* BMRESTRICT dst, while (d) { bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d; - unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; + d = bm::bmi_bslr_u64(d); // d &= d - 1; #if defined(VECT_AND_DIGEST_5WAY) bool all_zero = VECT_AND_DIGEST_5WAY(&dst[off], &src0[off], &src1[off], &src2[off], &src3[off]); @@ -6885,12 +6986,72 @@ bm::id64_t bit_block_and_5way(bm::word_t* BMRESTRICT dst, acc |= dst_u->w64[j + 3] &= src_u0->w64[j + 3] & src_u1->w64[j + 3] & src_u2->w64[j + 3] & src_u3->w64[j + 3]; j += 4; } while (j < bm::set_block_digest_wave_size / 2); - if (!acc) // all zero digest &= ~(mask << wave); #endif + } // while + return digest; +} + +/*! 
+ \brief digest based bit-block AND + + dst &= src1 AND src2 + + \param dst - src/destination block. + \param src1 - source block. + \param src2 - source block. + \param digest - known initial digest + + \return new digest + + @ingroup bitfunc +*/ +inline +bm::id64_t bit_block_and_3way(bm::word_t* BMRESTRICT dst, + const bm::word_t* BMRESTRICT src1, + const bm::word_t* BMRESTRICT src2, + bm::id64_t digest) BMNOEXCEPT +{ + BM_ASSERT(dst); + BM_ASSERT(src1 && src2); + BM_ASSERT(dst != src1 && dst != src2); + + const bm::id64_t mask(1ull); + bm::id64_t d = digest; + while (d) + { + bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d; + unsigned wave = bm::word_bitcount64(t - 1); + unsigned off = wave * bm::set_block_digest_wave_size; d = bm::bmi_bslr_u64(d); // d &= d - 1; + + #if defined(VECT_AND_DIGEST_3WAY) + bool all_zero = VECT_AND_DIGEST_3WAY(&dst[off], &src1[off], &src2[off]); + if (all_zero) + digest &= ~(mask << wave); + #else + const bm::bit_block_t::bunion_t* BMRESTRICT src_u1 = + (const bm::bit_block_t::bunion_t*)(&src1[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u2 = + (const bm::bit_block_t::bunion_t*)(&src2[off]); + bm::bit_block_t::bunion_t* BMRESTRICT dst_u = + (bm::bit_block_t::bunion_t*)(&dst[off]); + unsigned j = 0; bm::id64_t acc = 0; + do + { + acc |= dst_u->w64[j] &= src_u1->w64[j] & src_u2->w64[j]; + acc |= dst_u->w64[j+1] &= src_u1->w64[j+1] & src_u2->w64[j+1]; + acc |= dst_u->w64[j+2] &= src_u1->w64[j+2] & src_u2->w64[j+2]; + acc |= dst_u->w64[j+3] &= src_u1->w64[j+3] & src_u2->w64[j+3]; + j+=4; + } while (j < bm::set_block_digest_wave_size/2); + + if (!acc) // all zero + digest &= ~(mask << wave); + #endif + } // while return digest; @@ -6929,7 +7090,8 @@ bm::id64_t bit_block_and_2way(bm::word_t* BMRESTRICT dst, unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; - + d = bm::bmi_bslr_u64(d); // d &= d - 1; + #if defined(VECT_AND_DIGEST_2WAY) bool all_zero = VECT_AND_DIGEST_2WAY(&dst[off], &src1[off], &src2[off]); if (all_zero) @@ -6941,12 +7103,10 @@ bm::id64_t bit_block_and_2way(bm::word_t* BMRESTRICT dst, (const bm::bit_block_t::bunion_t*)(&src2[off]); bm::bit_block_t::bunion_t* BMRESTRICT dst_u = (bm::bit_block_t::bunion_t*)(&dst[off]); - - bm::id64_t acc = 0; - unsigned j = 0; + unsigned j = 0; bm::id64_t acc = 0; do { - acc |= dst_u->w64[j+0] = src_u1->w64[j+0] & src_u2->w64[j+0]; + acc |= dst_u->w64[j] = src_u1->w64[j] & src_u2->w64[j]; acc |= dst_u->w64[j+1] = src_u1->w64[j+1] & src_u2->w64[j+1]; acc |= dst_u->w64[j+2] = src_u1->w64[j+2] & src_u2->w64[j+2]; acc |= dst_u->w64[j+3] = src_u1->w64[j+3] & src_u2->w64[j+3]; @@ -6957,12 +7117,80 @@ bm::id64_t bit_block_and_2way(bm::word_t* BMRESTRICT dst, digest &= ~(mask << wave); #endif - d = bm::bmi_bslr_u64(d); // d &= d - 1; } // while return digest; } +/*! + \brief digest based bit-block AND (0 elements of digest will be zeroed) + + dst = src1 AND src2 + + \param dst - destination block. + \param src1 - source block. + \param src2 - source block. 
+ \param digest - known initial digest + + \return new digest + + @ingroup bitfunc +*/ +inline +bm::id64_t bit_block_init_and_2way(bm::word_t* BMRESTRICT dst, + const bm::word_t* BMRESTRICT src1, + const bm::word_t* BMRESTRICT src2, + bm::id64_t digest) BMNOEXCEPT +{ + bm::id64_t d = digest; + unsigned off; + for (unsigned i = 0; i < 64; ++i) + { + off = i * bm::set_block_digest_wave_size; + if (digest & 1) + { + #if defined(VECT_AND_DIGEST_2WAY) + bool all_zero = VECT_AND_DIGEST_2WAY(&dst[off], &src1[off], &src2[off]); + if (all_zero) + d &= ~(1ull << i); + #else + const bm::bit_block_t::bunion_t* BMRESTRICT src_u1 = + (const bm::bit_block_t::bunion_t*)(&src1[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u2 = + (const bm::bit_block_t::bunion_t*)(&src2[off]); + bm::bit_block_t::bunion_t* BMRESTRICT dst_u = + (bm::bit_block_t::bunion_t*)(&dst[off]); + bm::id64_t acc = 0; + unsigned j = 0; + do + { + acc |= dst_u->w64[j] = src_u1->w64[j] & src_u2->w64[j]; + acc |= dst_u->w64[j+1] = src_u1->w64[j+1] & src_u2->w64[j+1]; + acc |= dst_u->w64[j+2] = src_u1->w64[j+2] & src_u2->w64[j+2]; + acc |= dst_u->w64[j+3] = src_u1->w64[j+3] & src_u2->w64[j+3]; + j+=4; + } while (j < bm::set_block_digest_wave_size/2); + + if (!acc) // all zero + d &= ~(1ull << i); + #endif + + } + else // init to all 0s + { + #if defined(VECT_BLOCK_SET_DIGEST) + VECT_BLOCK_SET_DIGEST(&dst[off], 0u); + #else + for (; off < (i * bm::set_block_digest_wave_size) + + bm::set_block_digest_wave_size; off+=4) + dst[off] = dst[off+1] = dst[off+2] = dst[off+3] = 0u; + #endif + } + digest >>= 1ull; + } // for + return d; +} + /*! \brief digest based bit-block AND - OR @@ -7927,6 +8155,7 @@ bm::id64_t bit_block_sub(bm::word_t* BMRESTRICT dst, unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; + d = bm::bmi_bslr_u64(d); // d &= d - 1; #if defined(VECT_SUB_DIGEST) bool all_zero = VECT_SUB_DIGEST(&dst[off], &src[off]); @@ -7951,7 +8180,6 @@ bm::id64_t bit_block_sub(bm::word_t* BMRESTRICT dst, digest &= ~(mask << wave); #endif - d = bm::bmi_bslr_u64(d); // d &= d - 1; } // while return digest; @@ -7988,6 +8216,7 @@ bm::id64_t bit_block_sub_2way(bm::word_t* BMRESTRICT dst, unsigned wave = bm::word_bitcount64(t - 1); unsigned off = wave * bm::set_block_digest_wave_size; + d = bm::bmi_bslr_u64(d); // d &= d - 1; #if defined(VECT_SUB_DIGEST_2WAY) bool all_zero = VECT_SUB_DIGEST_2WAY(&dst[off], &src1[off], &src2[off]); @@ -8015,10 +8244,117 @@ bm::id64_t bit_block_sub_2way(bm::word_t* BMRESTRICT dst, if (!acc) // all zero digest &= ~(mask << wave); #endif + } // while + + return digest; +} + +/*! 
+ \brief digest based bit-block SUB 5-way + \return new digest + @ingroup bitfunc +*/ +inline +bm::id64_t bit_block_sub_5way(bm::word_t* BMRESTRICT dst, + const bm::word_t* BMRESTRICT src0, + const bm::word_t* BMRESTRICT src1, + const bm::word_t* BMRESTRICT src2, + const bm::word_t* BMRESTRICT src3, + bm::id64_t digest) BMNOEXCEPT +{ + BM_ASSERT(dst); + BM_ASSERT(src0 && src1 && src2 && src3); + + const bm::id64_t mask(1ull); + bm::id64_t d = digest; + while (d) + { + bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d; + + unsigned wave = bm::word_bitcount64(t - 1); + unsigned off = wave * bm::set_block_digest_wave_size; d = bm::bmi_bslr_u64(d); // d &= d - 1; + +#if defined(VECT_SUB_DIGEST_5WAY) + bool all_zero = VECT_SUB_DIGEST_5WAY(&dst[off], &src0[off], &src1[off], &src2[off], &src3[off]); + if (all_zero) + digest &= ~(mask << wave); +#else + const bm::bit_block_t::bunion_t* BMRESTRICT src_u0 = (const bm::bit_block_t::bunion_t*)(&src0[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u1 = (const bm::bit_block_t::bunion_t*)(&src1[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u2 = (const bm::bit_block_t::bunion_t*)(&src2[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u3 = (const bm::bit_block_t::bunion_t*)(&src3[off]); + bm::bit_block_t::bunion_t* BMRESTRICT dst_u = (bm::bit_block_t::bunion_t*)(&dst[off]); + + bm::id64_t acc = 0; + unsigned j = 0; + do + { + acc |= dst_u->w64[j + 0] &= ~src_u0->w64[j + 0] & ~src_u1->w64[j + 0] & ~src_u2->w64[j + 0] & ~src_u3->w64[j + 0]; + acc |= dst_u->w64[j + 1] &= ~src_u0->w64[j + 1] & ~src_u1->w64[j + 1] & ~src_u2->w64[j + 1] & ~src_u3->w64[j + 1]; + acc |= dst_u->w64[j + 2] &= ~src_u0->w64[j + 2] & ~src_u1->w64[j + 2] & ~src_u2->w64[j + 2] & ~src_u3->w64[j + 2]; + acc |= dst_u->w64[j + 3] &= ~src_u0->w64[j + 3] & ~src_u1->w64[j + 3] & ~src_u2->w64[j + 3] & ~src_u3->w64[j + 3]; + j += 4; + } while (j < bm::set_block_digest_wave_size / 2); + + if (!acc) // all zero + digest &= ~(mask << wave); +#endif + + } // while + + return digest; +} + +/*! 
+ \brief digest based bit-block SUB 3-way + \return new digest + @ingroup bitfunc +*/ +inline +bm::id64_t bit_block_sub_3way(bm::word_t* BMRESTRICT dst, + const bm::word_t* BMRESTRICT src0, + const bm::word_t* BMRESTRICT src1, + bm::id64_t digest) BMNOEXCEPT +{ + BM_ASSERT(dst); + BM_ASSERT(src0 && src1); + + const bm::id64_t mask(1ull); + bm::id64_t d = digest; + while (d) + { + bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d; + + unsigned wave = bm::word_bitcount64(t - 1); + unsigned off = wave * bm::set_block_digest_wave_size; + d = bm::bmi_bslr_u64(d); // d &= d - 1; + +#if defined(VECT_SUB_DIGEST_3WAY) + bool all_zero = VECT_SUB_DIGEST_3WAY(&dst[off], &src0[off], &src1[off]); + if (all_zero) + digest &= ~(mask << wave); +#else + const bm::bit_block_t::bunion_t* BMRESTRICT src_u0 = (const bm::bit_block_t::bunion_t*)(&src0[off]); + const bm::bit_block_t::bunion_t* BMRESTRICT src_u1 = (const bm::bit_block_t::bunion_t*)(&src1[off]); + bm::bit_block_t::bunion_t* BMRESTRICT dst_u = (bm::bit_block_t::bunion_t*)(&dst[off]); + + bm::id64_t acc = 0; + unsigned j = 0; + do + { + acc |= dst_u->w64[j + 0] &= ~src_u0->w64[j + 0] & ~src_u1->w64[j + 0]; + acc |= dst_u->w64[j + 1] &= ~src_u0->w64[j + 1] & ~src_u1->w64[j + 1]; + acc |= dst_u->w64[j + 2] &= ~src_u0->w64[j + 2] & ~src_u1->w64[j + 2]; + acc |= dst_u->w64[j + 3] &= ~src_u0->w64[j + 3] & ~src_u1->w64[j + 3]; + j += 4; + } while (j < bm::set_block_digest_wave_size / 2); + + if (!acc) // all zero + digest &= ~(mask << wave); +#endif } // while - return digest; } @@ -8410,7 +8746,7 @@ bool bit_find_first(const bm::word_t* BMRESTRICT block, BM_ASSERT(pos); #ifdef VECT_BIT_FIND_FIRST - return VECT_BIT_FIND_FIRST(block, pos); + return VECT_BIT_FIND_FIRST(block, 0, pos); #else for (unsigned i = 0; i < bm::set_block_size; ++i) { @@ -8431,7 +8767,7 @@ bool bit_find_first(const bm::word_t* BMRESTRICT block, \param block - bit block buffer pointer \param first - index of the first 1 bit (out) - \param digest - known digest of dst block + \param digest - known digest of dst block \return 0 if not found @@ -8447,20 +8783,43 @@ unsigned bit_find_first(const bm::word_t* BMRESTRICT block, BM_ASSERT(digest); bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d; - unsigned wave = bm::word_bitcount64(t - 1); - unsigned off = wave * bm::set_block_digest_wave_size; - for (unsigned i = off; i < bm::set_block_size; ++i) + unsigned i = wave * bm::set_block_digest_wave_size; + +#ifdef VECT_BIT_FIND_FIRST + return VECT_BIT_FIND_FIRST(block, i, first); +#else + do { - bm::word_t w = block[i]; - if (w) + bm::id64_t w64 = block[i] | block[i+1]; + if (w64) { - unsigned idx = bit_scan_forward32(w); // trailing zeros - *first = unsigned(idx + (i * 8u * unsigned(sizeof(bm::word_t)))); - return w; + unsigned base = i * 8u * unsigned(sizeof(bm::word_t)); + if (bm::word_t w0 = block[i]) + { + *first = bm::bit_scan_forward32(w0) + base; + return w0; + } + BM_ASSERT(block[i+1]); + return *first = bm::bit_scan_forward32(block[i+1]) + base + 32; } - } // for i + i+=2; + w64 = block[i] | block[i+1]; + if (w64) + { + unsigned base = i * 8u * unsigned(sizeof(bm::word_t)); + if (bm::word_t w0 = block[i]) + { + *first = bm::bit_scan_forward32(w0) + base; + return w0; + } + BM_ASSERT(block[i+1]); + return *first = bm::bit_scan_forward32(block[i+1]) + base + 32; + } + i+=2; + } while (i < bm::set_block_size); return 0u; +#endif } @@ -8484,35 +8843,59 @@ bool bit_find_first_if_1(const bm::word_t* BMRESTRICT block, BM_ASSERT(first); BM_ASSERT(bm::word_bitcount64(digest)==1); - bool found = 
false; - bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d; + const bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d; + const unsigned wave = bm::word_bitcount64(t - 1); + const unsigned off = wave * bm::set_block_digest_wave_size; - unsigned wave = bm::word_bitcount64(t - 1); - unsigned off = wave * bm::set_block_digest_wave_size; - unsigned i; - for (i = off; i < off + bm::set_block_digest_wave_size; ++i) +#if defined(BMSSE42OPT) || defined(BMAVX2OPT) || defined(BMAVX512OPT) + for (unsigned i = off; i < off + bm::set_block_digest_wave_size; i+=4) { - bm::word_t w = block[i]; - if (w) + __m128i wA = _mm_load_si128((__m128i*)&block[i]); + if (_mm_test_all_zeros(wA, wA)) + continue; + const unsigned cnt = i + 4; + do { - unsigned bc = bm::word_bitcount(w); - if (bc != 1) + const bm::word_t w = block[i]; + switch (bm::word_bitcount(w)) + { + case 0: break; + case 1: + *first = (i * 32) + bm::bit_scan_forward32(w); + for (++i; i < cnt; ++i) // check the rest of the SSE lane + if (block[i]) + return false; + goto check_tail; // break out of switch-while + default: return false; - - unsigned idx = bit_scan_forward32(w); // trailing zeros - *first = unsigned(idx + (i * 8u * sizeof(bm::word_t))); - found = true; - break; - } + } // switch + } while (++i < cnt); + check_tail: + for (; i < off + bm::set_block_digest_wave_size; i+=4) + { + wA = _mm_load_si128((__m128i*)&block[i]); + if (!_mm_test_all_zeros(wA, wA)) // another != 0 found + return false; + } // for i + return true; } // for i - - // check if all other bits are zero - for (++i; i < off + bm::set_block_digest_wave_size; ++i) +#else + for (auto i = off; i < off + bm::set_block_digest_wave_size; ++i) { - if (block[i]) - return false; - } - return found; + if (auto w = block[i]) + { + if (bm::word_bitcount(w) != 1) + return false; + const unsigned idx = (i * 32) + bm::bit_scan_forward32(w); // tzero + for (++i; i < off + bm::set_block_digest_wave_size; ++i) + if (block[i]) + return false; + *first = idx; + return true; + } + } // for i +#endif + return false; } diff --git a/tools/tax/src/bm/bmfwd.h b/tools/tax/src/bm/bmfwd.h index fbedae24..abed87c5 100644 --- a/tools/tax/src/bm/bmfwd.h +++ b/tools/tax/src/bm/bmfwd.h @@ -58,7 +58,7 @@ template class deserializer; template class sparse_vector; template class rsc_sparse_vector; -template class sparse_vector_scanner; +template class sparse_vector_scanner; template class sparse_vector_serializer; template class sparse_vector_deserializer; diff --git a/tools/tax/src/bm/bmserial.h b/tools/tax/src/bm/bmserial.h index fe35fa23..81524380 100644 --- a/tools/tax/src/bm/bmserial.h +++ b/tools/tax/src/bm/bmserial.h @@ -255,11 +255,12 @@ class serializer @param sim_model - [out] similarity model to compute @param ref_vect - [in] reference vectors @param params - parameters to regulate search depth + @return true - if similarity model created successfully @sa set_ref_vectors @internal */ - void compute_sim_model(xor_sim_model_type& sim_model, + bool compute_sim_model(xor_sim_model_type& sim_model, const bv_ref_vector_type& ref_vect, const bm::xor_sim_params& params); @@ -1312,11 +1313,11 @@ void serializer::set_ref_vectors(const bv_ref_vector_type* ref_vect) } template -void serializer::compute_sim_model(xor_sim_model_type& sim_model, +bool serializer::compute_sim_model(xor_sim_model_type& sim_model, const bv_ref_vector_type& ref_vect, const bm::xor_sim_params& params) { - xor_scan_.compute_sim_model(sim_model, ref_vect, params); + return xor_scan_.compute_sim_model(sim_model, ref_vect, params); } 
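The bool return added to compute_sim_model() lets callers detect when the XOR similarity model could not be built. A minimal caller-side sketch (illustrative only, not part of the patch; it relies on the serializer typedefs and set_ref_vectors() shown above, and assumes ref_vect is filled elsewhere):

    // hypothetical usage: check the new success flag before relying on
    // XOR reference compression
    typedef bm::bvector<> bvect;
    bm::serializer<bvect> ser;
    bm::serializer<bvect>::bv_ref_vector_type  ref_vect;  // populated elsewhere
    bm::serializer<bvect>::xor_sim_model_type  sim_model;
    bm::xor_sim_params params;

    ser.set_ref_vectors(&ref_vect);
    if (!ser.compute_sim_model(sim_model, ref_vect, params))
        ser.set_ref_vectors(0); // model not built: fall back to plain serialization
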
 template
@@ -4726,7 +4727,7 @@ void deserializer::xor_decode(blocks_manager_type& bman)
             if (nb_from == x_nb_ || nb_to == x_nb_)
                 return;
         }
-        bman.optimize_bit_block(i0, j0);
+        bman.optimize_bit_block(i0, j0, BV::opt_compress);
 }
 
 // ---------------------------------------------------------------------------
 
@@ -7101,7 +7102,7 @@ iterator_deserializer::deserialize(
         switch (op) // target block optimization for non-const operations
         {
         case set_AND: case set_SUB: case set_XOR: case set_OR:
-            bman.optimize_bit_block(i0, j0);
+            bman.optimize_bit_block(i0, j0, bvector_type::opt_compress);
             break;
         default: break;
         } // switch
diff --git a/tools/tax/src/bm/bmsparsevec.h b/tools/tax/src/bm/bmsparsevec.h
index f904e4ce..6deef2f9 100644
--- a/tools/tax/src/bm/bmsparsevec.h
+++ b/tools/tax/src/bm/bmsparsevec.h
@@ -358,11 +358,15 @@ class sparse_vector : public base_sparse_vector
         typedef typename bvector_type::block_idx_type block_idx_type;
 
     private:
-        bm::sparse_vector*   sv_;      ///!< pointer on the parent vector
-        bvector_type*        bv_null_; ///!< not NULL vector pointer
-        buffer_type          buffer_;  ///!< value buffer
-        unsigned_value_type* buf_ptr_; ///!< position in the buffer
-        bool                 set_not_null_;
+        buffer_type          buffer_;      ///!< value buffer
+        bm::sparse_vector*   sv_ = 0;      ///!< pointer on the parent vector
+        bvector_type*        bv_null_ = 0; ///!< not NULL vector pointer
+        unsigned_value_type* buf_ptr_ = 0; ///!< position in the buffer
+        bool                 set_not_null_ = true;
+
+        block_idx_type prev_nb_ = 0; ///!< previous block added
+        typename
+        bvector_type::optmode opt_mode_ = bvector_type::opt_compress;
     };
 
     friend const_iterator;
@@ -513,6 +517,15 @@ class sparse_vector : public base_sparse_vector
     */
     void erase(size_type idx, bool erase_null = true);
 
+
+    /**
+        \brief swap two vector elements
+        \param idx1 - element index 1
+        \param idx2 - element index 2
+    */
+    void swap(size_type idx1, size_type idx2);
+
+
     /*!
         \brief clear specified element with bounds checking and automatic resize
         \param idx - element index
@@ -793,6 +806,18 @@ class sparse_vector : public base_sparse_vector
     */
     void calc_stat(
         struct sparse_vector::statistics* st) const BMNOEXCEPT;
+
+    /**
+        @brief Turn sparse vector into immutable (read-only) mode
+        Read-only (immutable) vector uses less memory and allows faster searches.
+        Before freezing it is recommended to call optimize() to get the full memory-saving effect
+        @sa optimize
+    */
+    void freeze() { this->freeze_matr(); }
+
+    /** Returns true if vector is read-only */
+    bool is_ro() const BMNOEXCEPT { return this->is_ro_; }
+
     ///@}
 
     // ------------------------------------------------------------
@@ -865,7 +890,7 @@ class sparse_vector : public base_sparse_vector
     /*!
         \brief synchronize internal structures, build fast access index
     */
-    void sync(bool /*force*/) {}
+    void sync(bool /*force*/) { this->sync_ro(); }
 
     /*! 
@@ -1050,11 +1075,10 @@ class sparse_vector : public base_sparse_vector protected: template friend class rsc_sparse_vector; - template friend class sparse_vector_scanner; + template friend class sparse_vector_scanner; template friend class sparse_vector_serializer; template friend class sparse_vector_deserializer; - }; @@ -1227,7 +1251,7 @@ void sparse_vector::import_u_nocheck const size_type* r = tm.row(p); row_len[p] = 0; - bv->import_sorted(r, rl); + bv->import_sorted(r, rl, false); } } // for j @@ -1244,7 +1268,7 @@ void sparse_vector::import_u_nocheck if (!bv) bv = this->get_create_slice(k); const size_type* row = tm.row(k); - bv->import_sorted(row, rl); + bv->import_sorted(row, rl, false); } } // for k @@ -1279,7 +1303,29 @@ void sparse_vector::import_back(const value_type* arr, size_type arr_size, bool set_not_null) { - this->import_back_u((const unsigned_value_type)arr, arr_size, set_not_null); + if constexpr (std::is_signed::value) + { + const unsigned tmp_size = 1024; + unsigned_value_type arr_tmp[tmp_size]; + size_type k(0), i(0); + while (i < arr_size) + { + arr_tmp[k++] = this->s2u(arr[i++]); + if (k == tmp_size) + { + import_back_u(arr_tmp, k, set_not_null); + k = 0; + } + } // while + if (k) + { + import_back_u(arr_tmp, k, set_not_null); + } + } + else + { + this->import_back_u((const unsigned_value_type*)arr, arr_size, set_not_null); + } } @@ -1393,9 +1439,9 @@ sparse_vector::gather(value_type* arr, // process block co-located elements at ones for best (CPU cache opt) // - unsigned i0 = unsigned(nb >> bm::set_array_shift); // top block address - unsigned j0 = unsigned(nb & bm::set_array_mask); // address in sub-block - + unsigned i0, j0; + bm::get_block_coord(nb, i0, j0); + unsigned eff_planes = this->effective_slices(); // TODO: get real effective planes for [i,j] BM_ASSERT(eff_planes <= (sizeof(value_type) * 8)); @@ -1493,8 +1539,9 @@ sparse_vector::extract_range(value_type* arr, // calculate logical block coordinates and masks // block_idx_type nb = (start >> bm::set_block_shift); - unsigned i0 = unsigned(nb >> bm::set_array_shift); // top block address - unsigned j0 = unsigned(nb & bm::set_array_mask); // address in sub-block + unsigned i0, j0; + bm::get_block_coord(nb, i0, j0); + unsigned nbit = unsigned(start & bm::set_block_mask); unsigned nword = unsigned(nbit >> bm::set_word_shift); unsigned mask0 = 1u << (nbit & bm::set_word_mask); @@ -1515,8 +1562,7 @@ sparse_vector::extract_range(value_type* arr, if (nb1 != nb) // block switch boundaries { nb = nb1; - i0 = unsigned(nb >> bm::set_array_shift); - j0 = unsigned(nb & bm::set_array_mask); + bm::get_block_coord(nb, i0, j0); blk = this->bmatr_.get_block(j, i0, j0); is_gap = BM_IS_GAP(blk); } @@ -1815,6 +1861,17 @@ void sparse_vector::push_back_null(size_type count) this->size_ += count; } +//--------------------------------------------------------------------- + +template +void sparse_vector::swap(size_type idx1, size_type idx2) +{ + BM_ASSERT(idx1 < this->size()); + BM_ASSERT(idx2 < this->size()); + + this->swap_elements(idx1, idx2); +} + //--------------------------------------------------------------------- @@ -1916,12 +1973,6 @@ void sparse_vector::set_value_no_null(size_type idx, { unsigned_value_type uv = this->s2u(v); - // calculate logical block coordinates and masks - // - block_idx_type nb = (idx >> bm::set_block_shift); - unsigned i0 = unsigned(nb >> bm::set_array_shift); // top block address - unsigned j0 = unsigned(nb & bm::set_array_mask); // address in sub-block - // clear the planes where needed 
unsigned bsr = uv ? bm::bit_scan_reverse(uv) : 0u; if (need_clear) @@ -1942,9 +1993,13 @@ void sparse_vector::set_value_no_null(size_type idx, } else if (need_clear) { + block_idx_type nb = (idx >> bm::set_block_shift); + unsigned i0, j0; + bm::get_block_coord(nb, i0, j0); + if (const bm::word_t* blk = this->bmatr_.get_block(j, i0, j0)) { - // TODO: more efficient set/clear on on block + // TODO: more efficient set/clear on the block bvector_type* bv = this->bmatr_.get_row(j); bv->clear_bit_no_check(idx); } @@ -2367,7 +2422,6 @@ bool sparse_vector::const_iterator::is_null() const BMNOEXCEPT template sparse_vector::back_insert_iterator::back_insert_iterator() -: sv_(0), bv_null_(0), buf_ptr_(0), set_not_null_(true) {} //--------------------------------------------------------------------- @@ -2375,10 +2429,11 @@ sparse_vector::back_insert_iterator::back_insert_iterator() template sparse_vector::back_insert_iterator::back_insert_iterator( typename sparse_vector::back_insert_iterator::sparse_vector_type* sv) -: sv_(sv), set_not_null_(true) +: sv_(sv) { if (sv) { + prev_nb_ = sv_->size() >> bm::set_block_shift; bv_null_ = sv_->get_null_bvect(); buffer_.reserve(n_buf_size * sizeof(value_type)); buf_ptr_ = (unsigned_value_type*)(buffer_.data()); @@ -2395,10 +2450,12 @@ template sparse_vector::back_insert_iterator::back_insert_iterator( const typename sparse_vector::back_insert_iterator& bi) : sv_(bi.sv_), bv_null_(bi.bv_null_), buf_ptr_(0), - set_not_null_(bi.set_not_null_) + set_not_null_(bi.set_not_null_), + prev_nb_(bi.prev_nb_), opt_mode_(bi.opt_mode_) { if (sv_) { + prev_nb_ = sv_->size() >> bm::set_block_shift; buffer_.reserve(n_buf_size * sizeof(value_type)); buf_ptr_ = (unsigned_value_type*)(buffer_.data()); } @@ -2410,7 +2467,8 @@ template sparse_vector::back_insert_iterator::back_insert_iterator( typename sparse_vector::back_insert_iterator&& bi) BMNOEXCEPT : sv_(bi.sv_), bv_null_(bi.bv_null_), buf_ptr_(bi.buf_ptr_), - set_not_null_(bi.set_not_null_) + set_not_null_(bi.set_not_null_), + prev_nb_(bi.prev_nb_), opt_mode_(bi.opt_mode_) { buffer_.swap(bi.buffer_); buf_ptr_ = bi.buf_ptr_; @@ -2514,6 +2572,12 @@ bool sparse_vector::back_insert_iterator::flush() return false; sv_->import_back_u(arr, arr_size, false); buf_ptr_ = (unsigned_value_type*) buffer_.data(); + block_idx_type nb = sv_->size() >> bm::set_block_shift; + if (nb != prev_nb_) + { + sv_->optimize_block(prev_nb_, opt_mode_); + prev_nb_ = nb; + } return true; } diff --git a/tools/tax/src/bm/bmsparsevec_algo.h b/tools/tax/src/bm/bmsparsevec_algo.h index 6037c544..a6b83ed2 100644 --- a/tools/tax/src/bm/bmsparsevec_algo.h +++ b/tools/tax/src/bm/bmsparsevec_algo.h @@ -1,7 +1,7 @@ #ifndef BMSPARSEVEC_ALGO__H__INCLUDED__ #define BMSPARSEVEC_ALGO__H__INCLUDED__ /* -Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) +Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
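The flush() change above means the back-insert iterator now optimizes each sparse-vector block as soon as it is complete, instead of leaving all compression to a post-pass. A minimal bulk-load sketch combining it with the new freeze() (the typedef and data values are illustrative, not part of the patch):

    typedef bm::sparse_vector<unsigned, bm::bvector<> > svector_u32;

    svector_u32 sv;
    {
        svector_u32::back_insert_iterator bit = sv.get_back_inserter();
        for (unsigned i = 0; i < 1000000; ++i)
            bit = i & 0xFFu;  // append; completed blocks are optimized on the fly
        bit.flush();          // push out the buffered tail values
    }
    sv.optimize();            // optional full pass before freezing
    sv.freeze();              // turn the vector read-only (immutable)
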
@@ -28,6 +28,7 @@ For more information please visit:  http://bitmagic.io
 #endif
 
 #include
+#include
 
 #include "bmdef.h"
 #include "bmsparsevec.h"
@@ -481,6 +482,81 @@ void sparse_vector_find_mismatch(typename SV1::bvector_type& bv,
     }
 }
 
+/**
+    \brief Index for SV sorted vectors for approximate range queries
+
+    @internal
+ */
+template
+class sv_sample_index
+{
+public:
+    typedef typename SV::value_type                 value_type;
+    typedef typename SV::size_type                  size_type;
+    typedef typename SV::bvector_type               bvector_type;
+    typedef typename bvector_type::allocator_type   allocator_type;
+    typedef bm::dynamic_heap_matrix heap_matrix_type;
+
+    sv_sample_index(){}
+    sv_sample_index(const SV& sv, unsigned s_factor)
+    {
+        construct(sv, s_factor);
+    }
+
+
+
+    /**
+        Build sampling index for the sorted sparse vector
+        @param sv - string sparse vector to index
+        @param s_factor - sampling factor
+     */
+    void construct(const SV& sv, unsigned s_factor);
+
+
+    /// Original SV size
+    size_type sv_size() const BMNOEXCEPT { return sv_size_; }
+
+    /// Index size (number of sampled elements)
+    size_type size() const BMNOEXCEPT { return idx_size_; }
+
+    /// returns true if all index values are unique
+    bool is_unique() const BMNOEXCEPT { return idx_unique_; }
+
+    /// find range (binary)
+    /// @internal
+    bool bfind_range(const value_type* search_str,
+                     size_t in_len,
+                     size_type& l,
+                     size_type& r) const BMNOEXCEPT;
+
+    /// find common prefix between index elements and search string
+    ///
+    size_type common_prefix_length(const value_type* search_str,
+                                   size_t in_len,
+                                   size_type l, size_type r) const BMNOEXCEPT;
+
+
+    /**
+        recalculate range into SV coordinates range [from..to)
+    */
+    void recalc_range(const value_type* search_str,
+                      size_type& l,
+                      size_type& r) const BMNOEXCEPT;
+
+    /// Return length of minimal indexed string
+    size_t get_min_len() const BMNOEXCEPT { return min_key_len_; }
+
+
+
+private:
+    heap_matrix_type   s_cache_;    ///< cache for SV sampled elements
+    unsigned           s_factor_ = 0;
+    size_type          sv_size_ = 0;      ///< original sv size
+    size_type          idx_size_ = 0;     ///< index size
+    bool               idx_unique_ = true; ///< idx values unique or there are dups?
+    size_t             min_key_len_ = 0;  ///< minimal key size in index
+};
+
 
 /**
     \brief algorithms for sparse_vector scan/search
@@ -491,11 +567,17 @@ void sparse_vector_find_mismatch(typename SV1::bvector_type& bv,
     Class uses fast algorithms based on properties of bit-planes.
     This is NOT a brute force direct scan; the scanner uses search space pruning and cache optimizations
    to run the search.
+
+    S_FACTOR - Sampling factor for search. Can be: [ 4, 8, 16, 32, 64 ]. Default: 16.
+    A lower sampling factor (4, 8) lowers the memory footprint of the scanner class instance.
+    A higher one improves search performance (at the expense of memory for sampled elements).
+    The sampling factor is used for binary search in a bound string sparse vector, so memory consumption
+    depends on sampling and max string length.
@ingroup svalgo @ingroup setalgo */ -template +template class sparse_vector_scanner { public: @@ -508,7 +590,7 @@ class sparse_vector_scanner typedef typename bvector_type::allocator_type allocator_type; typedef typename allocator_type::allocator_pool_type allocator_pool_type; - typedef bm::aggregator aggregator_type; + typedef bm::aggregator aggregator_type; typedef bm::heap_vector remap_vector_type; @@ -659,6 +741,20 @@ class sparse_vector_scanner */ void find_eq(const SV& sv, value_type value, bvector_type& bv_out); + + /** + \brief find all sparse vector elements EQ to search value + + Find all sparse vector elements equivalent to specified value + + \param sv - input sparse vector + \param value - value to search for + \param bi - back insert iterator for the search results + */ + template + void find_eq(const SV& sv, value_type value, BII bi); + + /** \brief find first sparse vector element @@ -798,8 +894,11 @@ class sparse_vector_scanner void find_eq_str(TPipe& pipe); /** - \brief binary find first sparse vector element (string) - Sparse vector must be sorted. + \brief binary find first sparse vector element (string). Sparse vector must be sorted. + + @param sv - sparse vector of strings to search + @param str - string prefix to search for + @param pos - [out] first position found */ bool bfind_eq_str(const SV& sv, const value_type* str, size_type& pos); @@ -821,11 +920,28 @@ class sparse_vector_scanner /** \brief binary find first sparse vector element (string) - Sparse vector must be sorted and attached + Sparse vector must be sorted and attached (use method bind()) + + @param str - string prefix to search for + @param pos - [out] first position found + @sa bind */ bool bfind_eq_str(const value_type* str, size_type& pos); + /** + \brief binary find first sparse vector element (string) + Sparse vector must be sorted and attached (use method bind()) + + @param str - string prefix to search for + @param len - string length + @param pos - [out] first position found + + @sa bind + */ + bool bfind_eq_str(const value_type* str, size_t len, size_type& pos); + + //@} /** @@ -927,6 +1043,13 @@ class sparse_vector_scanner protected: + template + bool bfind_eq_str_impl(const SV& sv, + const value_type* str, size_t in_len, + bool remaped, + size_type& pos); + + /// Remap input value into SV char encodings static bool remap_tosv(remap_vector_type& remap_vect_target, @@ -934,10 +1057,10 @@ class sparse_vector_scanner const SV& sv); /// set search boundaries (hint for the aggregator) - void set_search_range(size_type from, size_type to); + void set_search_range(size_type from, size_type to) BMNOEXCEPT; /// reset (disable) search range - void reset_search_range(); + void reset_search_range() BMNOEXCEPT; /// find value (may include NULL indexes) bool find_eq_with_nulls(const SV& sv, @@ -953,8 +1076,10 @@ class sparse_vector_scanner /// find first string value (may include NULL indexes) bool find_first_eq(const SV& sv, const value_type* str, + size_t in_len, size_type& idx, - bool remaped); + bool remaped, + unsigned prefix_len); /// find EQ str / prefix impl bool find_eq_str_impl(const SV& sv, @@ -975,7 +1100,9 @@ class sparse_vector_scanner void decompress(const SV& sv, bvector_type& bv_out); /// compare sv[idx] with input str - int compare_str(const SV& sv, size_type idx, const value_type* str); + template + int compare_str(const SV& sv, size_type idx, + const value_type* str) const BMNOEXCEPT; /// compare sv[idx] with input value int compare(const SV& sv, size_type idx, const value_type val) 
BMNOEXCEPT; @@ -1001,6 +1128,13 @@ class sparse_vector_scanner return 0; } + void resize_buffers() + { + value_vect_.resize_no_copy(effective_str_max_ * 2); + remap_value_vect_.resize_no_copy(effective_str_max_ * 2); + remap_prefix_vect_.resize_no_copy(effective_str_max_ * 2); + } + protected: sparse_vector_scanner(const sparse_vector_scanner&) = delete; @@ -1063,6 +1197,12 @@ class sparse_vector_scanner } // for } + enum code + { + sub_bfind_block_cnt = S_FACTOR, + sub_block_l1_size = bm::gap_max_bits / S_FACTOR // size in bits/elements + }; + private: allocator_pool_type pool_; bvector_type bv_tmp_; @@ -1074,11 +1214,17 @@ class sparse_vector_scanner bool mask_set_; const SV* bound_sv_; - heap_matrix_type block0_elements_cache_; ///< cache for elements[0] of each block - heap_matrix_type block3_elements_cache_; ///< cache for elements[16384x] of each block + + bm::sv_sample_index range_idx_; ///< range index +/* + heap_matrix_type block_l0_cache_; ///< cache for elements[0] of each block + heap_matrix_type block_l1_cache_; ///< cache for elements[x] +*/ size_type effective_str_max_; - remap_vector_type remap_value_vect_; ///< remap buffer + remap_vector_type value_vect_; ///< value buffer + remap_vector_type remap_value_vect_; ///< remap buffer + remap_vector_type remap_prefix_vect_; ///< common prefix buffer /// masks of allocated bit-planes (1 - means there is a bit-plane) mask_vector_type vector_plane_masks_; matrix_search_buf_type hmatr_; ///< heap matrix for string search linear stage @@ -1432,8 +1578,8 @@ void set2set_11_transform::one_pass_run(const bvector_type& bv_in, // //---------------------------------------------------------------------------- -template -sparse_vector_scanner::sparse_vector_scanner() +template +sparse_vector_scanner::sparse_vector_scanner() { mask_set_ = false; mask_from_ = mask_to_ = bm::id_max; @@ -1444,41 +1590,24 @@ sparse_vector_scanner::sparse_vector_scanner() //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::bind(const SV& sv, bool sorted) +template +void sparse_vector_scanner::bind(const SV& sv, bool sorted) { + static_assert(S_FACTOR == 4 || S_FACTOR == 8 || S_FACTOR == 16 + || S_FACTOR == 32 || S_FACTOR == 64, + "BM: sparse_vector_scanner<> incorrect sampling factor template parameter"); + (void)sorted; // MSVC warning over if constexpr variable "not-referenced" bound_sv_ = &sv; if constexpr (SV::is_str()) // bindings for the string sparse vector { effective_str_max_ = sv.effective_vector_max(); + resize_buffers(); + if (sorted) { - size_type sv_sz = sv.size(); - BM_ASSERT(sv_sz); - size_type total_nb = sv_sz / bm::gap_max_bits + 1; - - block0_elements_cache_.resize(total_nb, effective_str_max_+1); - block0_elements_cache_.set_zero(); - - block3_elements_cache_.resize(total_nb * 3, effective_str_max_+1); - block3_elements_cache_.set_zero(); - - // fill in elements cache - for (size_type i = 0; i < sv_sz; i+= bm::gap_max_bits) - { - size_type nb = (i >> bm::set_block_shift); - value_type* s0 = block0_elements_cache_.row(nb); - sv.get(i, s0, size_type(block0_elements_cache_.cols())); - - for (size_type k = 0; k < 3; ++k) - { - value_type* s1 = block3_elements_cache_.row(nb * 3 + k); - size_type idx = i + (k+1) * bm::sub_block3_size; - sv.get(idx, s1, size_type(block3_elements_cache_.cols())); - } // for k - } // for i + range_idx_.construct(sv, S_FACTOR); } // pre-calculate vector plane masks // @@ -1492,8 +1621,8 @@ void sparse_vector_scanner::bind(const SV& sv, bool sorted) 
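+
+// Worked example of the sampling geometry behind the bound binary search
+// (assumes bm::gap_max_bits == 65536, i.e. one block spans 64K elements):
+//
+//   S_FACTOR = 16  ->  sub_bfind_block_cnt = 16 samples per block
+//                      sub_block_l1_size   = 65536 / 16 = 4096 elements
+//
+// so a bound bfind_eq_str() narrows a hit down to a 4096-element window
+// before the aggregator-based linear stage scans it.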
//---------------------------------------------------------------------------- -template -void sparse_vector_scanner::reset_binding() BMNOEXCEPT +template +void sparse_vector_scanner::reset_binding() BMNOEXCEPT { bound_sv_ = 0; effective_str_max_ = 0; @@ -1501,8 +1630,8 @@ void sparse_vector_scanner::reset_binding() BMNOEXCEPT //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_zero(const SV& sv, +template +void sparse_vector_scanner::find_zero(const SV& sv, bvector_type& bv_out, bool null_correct) { @@ -1528,8 +1657,8 @@ void sparse_vector_scanner::find_zero(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::invert(const SV& sv, bvector_type& bv_out) +template +void sparse_vector_scanner::invert(const SV& sv, bvector_type& bv_out) { if (sv.size() == 0) { @@ -1542,23 +1671,12 @@ void sparse_vector_scanner::invert(const SV& sv, bvector_type& bv_out) bv_out.invert(); bv_out.resize(old_sz); correct_nulls(sv, bv_out); - /* - bv_out.invert(); - const bvector_type* bv_null = sv.get_null_bvector(); - if (bv_null) // correct result to only use not NULL elements - bv_out &= *bv_null; - else - { - // TODO: use the shorter range to clear the tail - bv_out.set_range(sv.size(), bm::id_max - 1, false); - } - */ } //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::correct_nulls(const SV& sv, +template +void sparse_vector_scanner::correct_nulls(const SV& sv, bvector_type& bv_out) { const bvector_type* bv_null = sv.get_null_bvector(); @@ -1568,8 +1686,8 @@ void sparse_vector_scanner::correct_nulls(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_with_nulls(const SV& sv, +template +bool sparse_vector_scanner::find_eq_with_nulls(const SV& sv, value_type value, bvector_type& bv_out, size_type search_limit) @@ -1599,8 +1717,8 @@ bool sparse_vector_scanner::find_eq_with_nulls(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_first_eq(const SV& sv, +template +bool sparse_vector_scanner::find_first_eq(const SV& sv, value_type value, size_type& idx) { @@ -1623,32 +1741,77 @@ bool sparse_vector_scanner::find_first_eq(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_first_eq( +template +bool sparse_vector_scanner::find_first_eq( const SV& sv, const value_type* str, + size_t in_len, size_type& idx, - bool remaped) + bool remaped, + unsigned prefix_len) { - if (sv.empty()) - return false; // nothing to do - BM_ASSERT(*str); + BM_ASSERT(*str && in_len); + BM_ASSERT(in_len == ::strlen(str)); if (!*str) return false; - agg_.reset(); unsigned common_prefix_len = 0; - if (mask_set_) + + value_type* pref = remap_prefix_vect_.data(); + if (mask_set_) // it is assumed that the sv is SORTED so common prefix check + { + // if in range is exactly one block + if (/*bool one_nb = */agg_.set_range_hint(mask_from_, mask_to_)) + { + if (prefix_len == ~0u) // not valid (uncalculated) prefix len + { + common_prefix_len = + sv.template common_prefix_length(mask_from_, mask_to_, pref); + if (common_prefix_len) + { + if (remaped) + str = remap_value_vect_.data(); + // next comparison is in the remapped form + for (unsigned i = 0; i < common_prefix_len; ++i) + if 
(str[i] != pref[i]) + return false; + } + } + else + { + unsigned pl; (void)pl; + BM_ASSERT(prefix_len <= + (pl=sv.template common_prefix_length( + mask_from_, mask_to_, pref))); + common_prefix_len = prefix_len; + } + } // if one block hit + else + { + if (prefix_len != ~0u) // not valid (uncalculated) prefix len + { + unsigned pl; (void)pl; + BM_ASSERT(prefix_len <= + (pl=sv.template common_prefix_length( + mask_from_, mask_to_, pref))); + common_prefix_len = prefix_len; + } + } + } + + // prefix len checks + if (common_prefix_len && (in_len <= common_prefix_len)) { - agg_.set_range_hint(mask_from_, mask_to_); - common_prefix_len = sv.common_prefix_length(mask_from_, mask_to_); + if (in_len == common_prefix_len) + --common_prefix_len; + else // (in_len < common_prefix_len) + return false; } - + + const value_type* search_str = str; if (remaped) - { str = remap_value_vect_.data(); - } else { if (sv.is_remap() && (str != remap_value_vect_.data())) @@ -1662,21 +1825,28 @@ bool sparse_vector_scanner::find_first_eq( } bool found = prepare_and_sub_aggregator(sv, str, common_prefix_len, true); - if (!found) - return found; - - found = agg_.find_first_and_sub(idx); + if (found) + { + found = agg_.find_first_and_sub(idx); + if (found && idx > mask_to_) // out of bounds? may be false positive + { + int cmp = sv.compare(idx, search_str); + found = (cmp == 0); + } + } + agg_.reset(); return found; } //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, +template +bool sparse_vector_scanner::prepare_and_sub_aggregator( + const SV& sv, const value_type* str, - unsigned octet_start, - bool prefix_sub) + unsigned octet_start, + bool prefix_sub) { int len = 0; for (; str[len] != 0; ++len) @@ -1684,14 +1854,11 @@ bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, BM_ASSERT(len); // use reverse order (faster for sorted arrays) - for (int octet_idx = len-1; octet_idx >= 0; --octet_idx) + // octet_start is the common prefix length (end index) + for (int octet_idx = len-1; octet_idx >= int(octet_start); --octet_idx) { - if (unsigned(octet_idx) < octet_start) // common prefix - break; - - unsigned value = unsigned(str[octet_idx]) & 0xFF; + unsigned value = unsigned(str[octet_idx]) & 0xFFu; BM_ASSERT(value != 0); - bm::id64_t planes_mask; if (&sv == bound_sv_) planes_mask = vector_plane_masks_[unsigned(octet_idx)]; @@ -1708,9 +1875,9 @@ bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, // if (prefix_sub) { - unsigned plane_idx = unsigned(len * 8); typename SV::size_type planes = sv.get_bmatrix().rows_not_null(); - for (; plane_idx < planes; ++plane_idx) + for (unsigned plane_idx = unsigned(len * 8); + plane_idx < planes; ++plane_idx) { if (bvector_type_const_ptr bv = sv.get_slice(plane_idx)) agg_.add(bv, 1); // agg to SUB group @@ -1721,9 +1888,10 @@ bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, - value_type value) +template +bool sparse_vector_scanner::prepare_and_sub_aggregator( + const SV& sv, + value_type value) { using unsigned_value_type = typename SV::unsigned_value_type; @@ -1761,8 +1929,8 @@ bool sparse_vector_scanner::prepare_and_sub_aggregator(const SV& sv, //---------------------------------------------------------------------------- -template -void 
sparse_vector_scanner::find_eq_with_nulls_horizontal( +template +void sparse_vector_scanner::find_eq_with_nulls_horizontal( const SV& sv, value_type value, bvector_type& bv_out) @@ -1813,8 +1981,9 @@ void sparse_vector_scanner::find_eq_with_nulls_horizontal( //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_gt(const SV& sv, +template +void sparse_vector_scanner::find_gt( + const SV& sv, value_type val, bvector_type& bv_out) { @@ -1824,8 +1993,9 @@ void sparse_vector_scanner::find_gt(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_ge(const SV& sv, +template +void sparse_vector_scanner::find_ge( + const SV& sv, value_type val, bvector_type& bv_out) { @@ -1862,8 +2032,9 @@ void sparse_vector_scanner::find_ge(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_lt(const SV& sv, +template +void sparse_vector_scanner::find_lt( + const SV& sv, value_type val, bvector_type& bv_out) { @@ -1873,10 +2044,10 @@ void sparse_vector_scanner::find_lt(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_le(const SV& sv, - value_type val, - bvector_type& bv_out) +template +void sparse_vector_scanner::find_le(const SV& sv, + value_type val, + bvector_type& bv_out) { find_gt(sv, val, bv_out); invert(sv, bv_out); @@ -1884,8 +2055,8 @@ void sparse_vector_scanner::find_le(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_range(const SV& sv, +template +void sparse_vector_scanner::find_range(const SV& sv, value_type from, value_type to, bvector_type& bv_out) { @@ -1902,8 +2073,8 @@ void sparse_vector_scanner::find_range(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_gt_horizontal(const SV& sv, +template +void sparse_vector_scanner::find_gt_horizontal(const SV& sv, value_type value, bvector_type& bv_out, bool null_correct) @@ -2088,8 +2259,8 @@ void sparse_vector_scanner::find_gt_horizontal(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::aggregate_OR_slices( +template +void sparse_vector_scanner::aggregate_OR_slices( bvector_type& bv_target, const SV& sv, unsigned from, unsigned total_planes) @@ -2107,8 +2278,8 @@ void sparse_vector_scanner::aggregate_OR_slices( //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::aggregate_AND_OR_slices(bvector_type& bv_target, +template +void sparse_vector_scanner::aggregate_AND_OR_slices(bvector_type& bv_target, const bvector_type& bv_mask, const SV& sv, unsigned from, unsigned total_planes) @@ -2125,8 +2296,8 @@ void sparse_vector_scanner::aggregate_AND_OR_slices(bvector_type& bv_target, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str_prefix(const SV& sv, +template +bool sparse_vector_scanner::find_eq_str_prefix(const SV& sv, const typename SV::value_type* str, typename SV::bvector_type& bv_out) { @@ -2136,9 +2307,10 @@ bool sparse_vector_scanner::find_eq_str_prefix(const SV& sv, 
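+
+/*
+    Usage sketch for the ordered searches above (illustrative; assumes a filled
+    bm::sparse_vector<unsigned, bm::bvector<> > named "sv"):
+
+        bm::sparse_vector_scanner<bm::sparse_vector<unsigned, bm::bvector<> > > scanner;
+        bm::bvector<> bv_res;
+        scanner.find_range(sv, 10u, 20u, bv_res); // all i where 10 <= sv[i] <= 20
+
+    Note the composition: find_le() runs find_gt() and then invert(), which
+    flips the result and re-applies the NULL correction via correct_nulls().
+*/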
//---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str(const typename SV::value_type* str, - typename SV::size_type& pos) +template +bool sparse_vector_scanner::find_eq_str( + const typename SV::value_type* str, + typename SV::size_type& pos) { BM_ASSERT(bound_sv_); return this->find_eq_str(*bound_sv_, str, pos); @@ -2146,10 +2318,11 @@ bool sparse_vector_scanner::find_eq_str(const typename SV::value_type* str, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str(const SV& sv, - const typename SV::value_type* str, - typename SV::size_type& pos) +template +bool sparse_vector_scanner::find_eq_str( + const SV& sv, + const typename SV::value_type* str, + typename SV::size_type& pos) { bool found = false; if (sv.empty()) @@ -2170,9 +2343,10 @@ bool sparse_vector_scanner::find_eq_str(const SV& sv, str = remap_value_vect_.data(); } } - + + size_t in_len = ::strlen(str); size_type found_pos; - found = find_first_eq(sv, str, found_pos, remaped); + found = find_first_eq(sv, str, in_len, found_pos, remaped, ~0u); if (found) { pos = found_pos; @@ -2195,9 +2369,10 @@ bool sparse_vector_scanner::find_eq_str(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str(const typename SV::value_type* str, - typename SV::bvector_type& bv_out) +template +bool sparse_vector_scanner::find_eq_str( + const typename SV::value_type* str, + typename SV::bvector_type& bv_out) { BM_ASSERT(bound_sv_); return find_eq_str(*bound_sv_, str, bv_out); @@ -2205,8 +2380,9 @@ bool sparse_vector_scanner::find_eq_str(const typename SV::value_type* str, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str(const SV& sv, +template +bool sparse_vector_scanner::find_eq_str( + const SV& sv, const typename SV::value_type* str, typename SV::bvector_type& bv_out) { @@ -2215,8 +2391,8 @@ bool sparse_vector_scanner::find_eq_str(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::remap_tosv( +template +bool sparse_vector_scanner::remap_tosv( remap_vector_type& remap_vect_target, const typename SV::value_type* str, const SV& sv) @@ -2229,8 +2405,9 @@ bool sparse_vector_scanner::remap_tosv( //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq_str_impl(const SV& sv, +template +bool sparse_vector_scanner::find_eq_str_impl( + const SV& sv, const typename SV::value_type* str, typename SV::bvector_type& bv_out, bool prefix_sub) @@ -2280,8 +2457,8 @@ bool sparse_vector_scanner::find_eq_str_impl(const SV& sv, //---------------------------------------------------------------------------- -template template -void sparse_vector_scanner::find_eq_str(TPipe& pipe) +template template +void sparse_vector_scanner::find_eq_str(TPipe& pipe) { if (pipe.bv_and_mask_) { @@ -2297,135 +2474,146 @@ void sparse_vector_scanner::find_eq_str(TPipe& pipe) //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::bfind_eq_str( +template +template +bool sparse_vector_scanner::bfind_eq_str_impl( const SV& sv, const typename SV::value_type* str, + size_t in_len, + bool remaped, typename SV::size_type& pos) { bool found = false; if (sv.empty()) return found; - 
if (*str) + unsigned prefix_len = ~0u; + + if (in_len) { - bool remaped = false; - // test search pre-condition based on remap tables - if constexpr (SV::is_remap_support::value) + reset_search_range(); + + size_type l, r; + size_type found_pos; + + if constexpr (BOUND) { - if (sv.is_remap() && (str != remap_value_vect_.data())) + found = range_idx_.bfind_range(str, in_len, l, r); + if (!found) + return found; + + prefix_len = + (unsigned) range_idx_.common_prefix_length(str, in_len, l, r); + + if ((l == r) && (in_len == prefix_len)) { - auto str_len = sv.effective_vector_max(); - remap_value_vect_.resize(str_len); - remaped = sv.remap_tosv(remap_value_vect_.data(), str_len, str); - if (!remaped) - return remaped; + range_idx_.recalc_range(str, l, r); + pos = l; + return found; } + + range_idx_.recalc_range(str, l, r); + set_search_range(l, r); // r := r-1 (may happen here) [l..r] interval + + BM_ASSERT(this->compare_str(sv, l, str) <= 0); + // bad assert, because of the r = r-1 correction in recalc_range() + //BM_ASSERT(this->compare_str(sv, r, str) >= 0); + } - - reset_search_range(); - - // narrow down the search - const unsigned min_distance_cutoff = bm::gap_max_bits + bm::gap_max_bits / 2; - size_type l, r, dist; - l = 0; r = sv.size()-1; - size_type found_pos; - - // binary search to narrow down the search window - while (l <= r) + else { - dist = r - l; - if (dist < min_distance_cutoff) + // narrow down the search + const unsigned min_distance_cutoff = bm::gap_max_bits + bm::gap_max_bits / 2; + size_type dist; + l = 0; r = sv.size()-1; + + // binary search to narrow down the search window + while (l <= r) { - // we are in an narrow <2 blocks window, but still may be in two - // different neighboring blocks, lets try to narrow - // to exactly one block - - size_type nb_l = (l >> bm::set_block_shift); - size_type nb_r = (r >> bm::set_block_shift); - if (nb_l != nb_r) + dist = r - l; + if (dist < min_distance_cutoff) { - size_type mid = nb_r * bm::gap_max_bits; - if (mid < r) + // we are in an narrow <2 blocks window, but still may be in two + // different neighboring blocks, lets try to narrow + // to exactly one block + + size_type nb_l = (l >> bm::set_block_shift); + size_type nb_r = (r >> bm::set_block_shift); + if (nb_l != nb_r) { - int cmp = this->compare_str(sv, mid, str); - if (cmp < 0) // mid < str - l = mid; - else - r = mid-(cmp!=0); // branchless if (cmp==0) r=mid; - BM_ASSERT(l < r); + size_type mid = nb_r * bm::gap_max_bits; + if (mid < r) + { + int cmp = this->compare_str(sv, mid, str); + if (cmp < 0) // mid < str + l = mid; + else + r = mid-(cmp!=0); // branchless if (cmp==0) r=mid; + BM_ASSERT(l < r); + } + nb_l = unsigned(l >> bm::set_block_shift); + nb_r = unsigned(r >> bm::set_block_shift); } - nb_l = unsigned(l >> bm::set_block_shift); - nb_r = unsigned(r >> bm::set_block_shift); - } - - if (nb_l == nb_r) - { - size_type max_nb = sv.size() >> bm::set_block_shift; - if (nb_l != max_nb) + + if (nb_l == nb_r) { - // linear in-place fixed depth scan to identify the sub-range - size_type mid = nb_r * bm::gap_max_bits + bm::sub_block3_size; - int cmp = this->compare_str(sv, mid, str); - if (cmp < 0) + size_type max_nb = sv.size() >> bm::set_block_shift; + if (nb_l != max_nb) { - l = mid; - mid = nb_r * bm::gap_max_bits + bm::sub_block3_size * 2; - cmp = this->compare_str(sv, mid, str); - if (cmp < 0) + // linear scan to identify the sub-range + size_type mid = nb_r * bm::gap_max_bits + sub_block_l1_size; + for (unsigned i = 0; i < (sub_bfind_block_cnt-1); + ++i, mid += 
sub_block_l1_size) { - l = mid; - mid = nb_r * bm::gap_max_bits + bm::sub_block3_size * 3; - cmp = this->compare_str(sv, mid, str); + int cmp = this->compare_str(sv, mid, str); if (cmp < 0) l = mid; else + { r = mid; - } - else - { - r = mid; - } - } - else - { - r = mid; + break; + } + } // for i } + set_search_range(l, r); + break; } } - - set_search_range(l, r); - break; - } - typename SV::size_type mid = dist/2+l; - size_type nb = (mid >> bm::set_block_shift); - mid = nb * bm::gap_max_bits; - if (mid <= l) - { - if (nb == 0 && r > bm::gap_max_bits) - mid = bm::gap_max_bits; + typename SV::size_type mid = dist/2+l; + size_type nb = (mid >> bm::set_block_shift); + mid = nb * bm::gap_max_bits; + int cmp; + if (mid <= l) + { + if (nb == 0 && r > bm::gap_max_bits) + mid = bm::gap_max_bits; + else + { + mid = dist / 2 + l; + cmp = this->compare_str(sv, mid, str); + goto l1; + } + } + BM_ASSERT(mid > l); + cmp = this->compare_str(sv, mid, str); + l1: + if (cmp == 0) + { + found_pos = mid; + //found = true; + set_search_range(l, mid); + break; + } + if (cmp < 0) + l = mid+1; else - mid = dist / 2 + l; - } - BM_ASSERT(mid > l); - - int cmp = this->compare_str(sv, mid, str); - if (cmp == 0) - { - found_pos = mid; - found = true; - set_search_range(l, mid); - break; - } - if (cmp < 0) - l = mid+1; - else - r = mid-1; - } // while + r = mid-1; + } // while + } // use linear search (range is set) - found = find_first_eq(sv, str, found_pos, remaped); + found = find_first_eq(sv, str, in_len, found_pos, remaped, prefix_len); if (found) { pos = found_pos; @@ -2446,18 +2634,101 @@ bool sparse_vector_scanner::bfind_eq_str( //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::bfind_eq_str(const typename SV::value_type* str, - typename SV::size_type& pos) +template +bool sparse_vector_scanner::bfind_eq_str(const SV& sv, + const value_type* str, size_type& pos) +{ + size_t len = ::strlen(str); + effective_str_max_ = sv.effective_max_str(); + if (len > effective_str_max_) + return false; // impossible value + + resize_buffers(); + + bool remaped = false; + if constexpr (SV::is_remap_support::value) + { + if (sv.is_remap()) + { + remap_value_vect_.resize_no_copy(len); + remaped = sv.remap_tosv(remap_value_vect_.data(), + effective_str_max_, str); + if (!remaped) + return remaped; + } + } + return bfind_eq_str_impl(sv, str, len, remaped, pos); +} + +//---------------------------------------------------------------------------- + +template +bool sparse_vector_scanner::bfind_eq_str( + const typename SV::value_type* str, + typename SV::size_type& pos) +{ + BM_ASSERT(bound_sv_); // this function needs prior bind() + size_t len = ::strlen(str); + if (len > effective_str_max_) + return false; // impossible value + bool remaped = false; + if constexpr (SV::is_remap_support::value) + { + if (bound_sv_->is_remap()) + { + remaped = bound_sv_->remap_tosv(remap_value_vect_.data(), + effective_str_max_, str); + if (!remaped) + return remaped; + } + } + return bfind_eq_str_impl(*bound_sv_, str, len, remaped, pos); +} + +//---------------------------------------------------------------------------- + +template +bool sparse_vector_scanner::bfind_eq_str( + const value_type* str, size_t in_len, size_type& pos) { + BM_ASSERT(str); BM_ASSERT(bound_sv_); - return bfind_eq_str(*bound_sv_, str, pos); + + if (in_len > effective_str_max_) + return false; // impossible value + + value_type* s = value_vect_.data(); // copy to temp buffer, put zero end + + bool remaped = 
false; + // test search pre-condition based on remap tables + if constexpr (SV::is_remap_support::value) + { + if (bound_sv_->is_remap()) + { + remaped = bound_sv_->remap_n_tosv_2way( + remap_value_vect_.data(), + s, + effective_str_max_, + str, + in_len); + if (!remaped) + return remaped; + } + } + if (!remaped) // copy string, make sure it is zero terminated + { + for (size_t i = 0; i < in_len && *str; ++i) + s[i] = str[i]; + s[in_len] = value_type(0); + } + return bfind_eq_str_impl(*bound_sv_, s, in_len, remaped, pos); } //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::bfind(const SV& sv, +template +bool sparse_vector_scanner::bfind( + const SV& sv, const typename SV::value_type val, typename SV::size_type& pos) { @@ -2576,8 +2847,8 @@ bool sparse_vector_scanner::bfind(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::lower_bound_str( +template +bool sparse_vector_scanner::lower_bound_str( const SV& sv, const typename SV::value_type* str, typename SV::size_type& pos) @@ -2593,7 +2864,7 @@ bool sparse_vector_scanner::lower_bound_str( --r; // check initial boundary conditions if insert point is at tail/head - cmp = this->compare_str(sv, l, str); // left (0) boundary check + cmp = this->compare_str(sv, l, str); // left (0) boundary check if (cmp > 0) // vect[x] > str { pos = 0; @@ -2604,7 +2875,7 @@ bool sparse_vector_scanner::lower_bound_str( pos = 0; return true; } - cmp = this->compare_str(sv, r, str); // right(size-1) boundary check + cmp = this->compare_str(sv, r, str); // right(size-1) boundary check if (cmp == 0) { pos = r; @@ -2612,7 +2883,7 @@ bool sparse_vector_scanner::lower_bound_str( // TODO: adapt one-sided binary search to traverse large platos for (; r >= 0; --r) { - cmp = this->compare_str(sv, r, str); + cmp = this->compare_str(sv, r, str); if (cmp != 0) return true; pos = r; @@ -2630,7 +2901,7 @@ bool sparse_vector_scanner::lower_bound_str( { for (; l <= r; ++l) { - cmp = this->compare_str(sv, l, str); + cmp = this->compare_str(sv, l, str); if (cmp == 0) { pos = l; @@ -2646,14 +2917,14 @@ bool sparse_vector_scanner::lower_bound_str( while (l <= r) { size_type mid = (r-l)/2+l; - cmp = this->compare_str(sv, mid, str); + cmp = this->compare_str(sv, mid, str); if (cmp == 0) { pos = mid; // back-scan to rewind all duplicates for (size_type i = mid-1; i >= 0; --i) { - cmp = this->compare_str(sv, i, str); + cmp = this->compare_str(sv, i, str); if (cmp != 0) return true; pos = i; @@ -2691,7 +2962,7 @@ bool sparse_vector_scanner::lower_bound_str( return false; } } - cmp = this->compare_str(sv, l, str); + cmp = this->compare_str(sv, l, str); if (cmp > 0) // vect[x] > str { pos = l; @@ -2710,59 +2981,85 @@ bool sparse_vector_scanner::lower_bound_str( //---------------------------------------------------------------------------- -template -int sparse_vector_scanner::compare_str(const SV& sv, +template +template +int sparse_vector_scanner::compare_str( + const SV& sv, size_type idx, - const value_type* str) + const value_type* BMRESTRICT str + ) const BMNOEXCEPT { - if (bound_sv_ == &sv) +#if 0 + if constexpr (BOUND) { + BM_ASSERT(bound_sv_ == &sv); + size_type nb = (idx >> bm::set_block_shift); size_type nbit = (idx & bm::set_block_mask); - if (nbit == 0) // access to sentinel, first block element + int res = 0; + const value_type* BMRESTRICT s0; + /* + if (!nbit) // access to sentinel, first block element + s0 = 
block_l0_cache_.row(nb); + else { - value_type* s0 = block0_elements_cache_.row(nb); - if (*s0 == 0) // uninitialized element + BM_ASSERT(nbit % sub_block_l1_size == 0); + size_type k = + (nb * (sub_bfind_block_cnt-1)) + (nbit / sub_block_l1_size - 1); + s0 = block_l1_cache_.row(k); + } + */ + // strcmp + /* + if constexpr (sizeof(void*) == 8) // TODO: improve for WASM + { + for (unsigned i = 0; true; i+=sizeof(bm::id64_t)) { - sv.get(idx, s0, size_type(block0_elements_cache_.cols())); - } - int res = 0; - for (unsigned i = 0; i < block0_elements_cache_.cols(); ++i) + bm::id64_t o64, v64; + ::memcpy(&o64, str+i, sizeof(o64)); + ::memcpy(&v64, s0+i, sizeof(v64)); + + if (o64 != v64 || bm::has_zero_byte_u64(o64) + || bm::has_zero_byte_u64(v64)) + { + do + { + char octet = str[i]; char value = s0[i]; + res = (value > octet) - (value < octet); + if (res || !octet) + return res; + ++i; + } while(1); + } + } // for i + } + else */ + { + for (unsigned i = 0; true; ++i) { char octet = str[i]; char value = s0[i]; res = (value > octet) - (value < octet); if (res || !octet) break; } // for i - return res; - } - else - { - if (nbit % bm::sub_block3_size == 0) // TODO: use AND mask here - { - size_type k = nbit / bm::sub_block3_size - 1; - value_type* s1 = block3_elements_cache_.row(nb * 3 + k); - int res = 0; - for (unsigned i = 0; i < block3_elements_cache_.cols(); ++i) - { - char octet = str[i]; char value = s1[i]; - res = (value > octet) - (value < octet); - if (res || !octet) - break; - } // for i - return res; - } } + + return res; + } + else +#endif + { + return sv.compare(idx, str); } - return sv.compare(idx, str); } //---------------------------------------------------------------------------- -template -int sparse_vector_scanner::compare(const SV& sv, - size_type idx, - const value_type val) BMNOEXCEPT +template +int sparse_vector_scanner::compare( + const SV& sv, + size_type idx, + const value_type val) BMNOEXCEPT { // TODO: implement sentinel elements cache (similar to compare_str()) return sv.compare(idx, val); @@ -2770,8 +3067,9 @@ int sparse_vector_scanner::compare(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_eq(const SV& sv, +template +void sparse_vector_scanner::find_eq( + const SV& sv, typename SV::value_type value, typename SV::bvector_type& bv_out) { @@ -2794,8 +3092,43 @@ void sparse_vector_scanner::find_eq(const SV& sv, //---------------------------------------------------------------------------- -template -bool sparse_vector_scanner::find_eq(const SV& sv, +template template +void sparse_vector_scanner::find_eq( + const SV& sv, value_type value, BII bi) +{ + static_assert(!SV::is_compressed(), "BM:find_eq on RSC vector not implemented"); + + if (sv.empty()) + return; // nothing to do + if (!value) + { + // TODO: better implementation for 0 value seach + typename SV::bvector_type bv_out; + find_zero(sv, bv_out); + typename SV::bvector_type::enumerator en = bv_out.get_enumerator(0); + for (; en.valid(); ++en) + *bi = *en; + return; + } + + // search for value with aggregator + // + agg_.reset(); + + bool found = prepare_and_sub_aggregator(sv, value); + if (!found) + return; // impossible value + + found = agg_.combine_and_sub_bi(bi); + agg_.reset(); +} + + +//---------------------------------------------------------------------------- + +template +bool sparse_vector_scanner::find_eq( + const SV& sv, typename SV::value_type value, typename SV::size_type& pos) { @@ -2823,9 +3156,10 @@ bool 
sparse_vector_scanner::find_eq(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_nonzero(const SV& sv, - typename SV::bvector_type& bv_out) +template +void sparse_vector_scanner::find_nonzero( + const SV& sv, + typename SV::bvector_type& bv_out) { agg_.reset(); // in case if previous scan was interrupted auto sz = sv.effective_slices(); // sv.slices(); @@ -2837,8 +3171,9 @@ void sparse_vector_scanner::find_nonzero(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::find_positive(const SV& sv, +template +void sparse_vector_scanner::find_positive( + const SV& sv, typename SV::bvector_type& bv_out) { BM_ASSERT(sv.size()); @@ -2852,8 +3187,9 @@ void sparse_vector_scanner::find_positive(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::decompress(const SV& sv, +template +void sparse_vector_scanner::decompress( + const SV& sv, typename SV::bvector_type& bv_out) { if constexpr (SV::is_compressed()) @@ -2873,32 +3209,31 @@ void sparse_vector_scanner::decompress(const SV& sv, //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::set_search_range(size_type from, size_type to) +template +void sparse_vector_scanner::set_search_range( + size_type from, size_type to) BMNOEXCEPT { - BM_ASSERT(from < to); - mask_from_ = from; - mask_to_ = to; - mask_set_ = true; + BM_ASSERT(from <= to); + mask_from_ = from; mask_to_ = to; mask_set_ = true; } //---------------------------------------------------------------------------- -template -void sparse_vector_scanner::reset_search_range() +template +void sparse_vector_scanner::reset_search_range() BMNOEXCEPT { mask_set_ = false; - mask_from_ = mask_to_ = bm::id_max; } //---------------------------------------------------------------------------- -// sparse_vector_scanner::pipeline +// sparse_vector_scanner::pipeline //---------------------------------------------------------------------------- -template template +template template void -sparse_vector_scanner::pipeline::set_search_mask(const bvector_type* bv_mask) BMNOEXCEPT +sparse_vector_scanner::pipeline::set_search_mask( + const bvector_type* bv_mask) BMNOEXCEPT { static_assert(Opt::is_masks(), "BM: Search masking needs to be enabled in template parameter options before function call. 
see bm::agg_run_options<> "); @@ -2907,9 +3242,10 @@ sparse_vector_scanner::pipeline::set_search_mask(const bvector_type* bv //---------------------------------------------------------------------------- -template template +template template void -sparse_vector_scanner::pipeline::add(const typename SV::value_type* str) +sparse_vector_scanner::pipeline::add( + const typename SV::value_type* str) { BM_ASSERT(str); @@ -2969,7 +3305,7 @@ sparse_vector_scanner::pipeline::add(const typename SV::value_type* str //if (prefix_sub) { unsigned plane_idx = unsigned(len * 8); - // SUB group should NOt include not NULL bvector + // SUB group should not include not NULL bvector size_type planes = size_type(this->eff_slices_); for (; plane_idx < planes; ++plane_idx) { @@ -2980,8 +3316,260 @@ sparse_vector_scanner::pipeline::add(const typename SV::value_type* str } +//---------------------------------------------------------------------------- +// sv_sample_index +//---------------------------------------------------------------------------- + +template +void sv_sample_index::construct(const SV& sv, unsigned s_factor) +{ + BM_ASSERT(SV::is_str()); + s_factor_ = s_factor; + sv_size_ = sv.size(); + if (!sv_size_) + return; + + // resize and init the cache matrix + // + auto effective_str_max = sv.effective_vector_max() + 1; + size_type total_nb = (sv_size_ / bm::gap_max_bits) + 1; + size_type idx_size = total_nb * s_factor + 1; + s_cache_.init_resize(idx_size, effective_str_max); + s_cache_.set_zero(); + + // build the index + const size_type cols = size_type(s_cache_.cols()); + const size_type s_step = bm::gap_max_bits / s_factor; + idx_size_ = 0; + for(size_type i = 0; true; ) + { + value_type* s_str = s_cache_.row(idx_size_); + ++idx_size_; + sv.get(i, s_str, cols); + + if (i == sv_size_-1) // last element was aleady covered, break + break; + i += s_step; + if (i >= sv_size_) // add the last sampled element + { + i = sv_size_-1; + if (i) + { + s_str = s_cache_.row(idx_size_); + ++idx_size_; + sv.get(i, s_str, cols); + } + break; + } + } // for i + + size_t min_len = 0; + { + const value_type* s = s_cache_.row(0); + min_len = ::strlen(s); + } + + // find index duplicates, minimum key size, ... 
+ // + idx_unique_ = true; + const value_type* str_prev = s_cache_.row(0); + for(size_type i = 1; i < idx_size_; ++i) + { + const value_type* str_curr = s_cache_.row(i); + size_t curr_len = ::strlen(str_curr); + if (curr_len < min_len) + min_len = curr_len; + + int cmp = SV::compare_str(str_prev, str_curr); + BM_ASSERT(cmp <= 0); + if (cmp == 0) // duplicate + { + idx_unique_ = false; + break; + } + str_prev = str_curr; + } // for i + + min_key_len_ = min_len; + +} + //---------------------------------------------------------------------------- +template +bool sv_sample_index::bfind_range(const value_type* search_str, + size_t in_len, + size_type& l, + size_type& r) const BMNOEXCEPT +{ + const size_type linear_cutoff = 4; + if (!idx_size_) + return false; + l = 0; r = idx_size_ - 1; + int cmp; + + size_t min_len = this->min_key_len_; + if (in_len < min_len) + min_len = in_len; + + // check the left-right boundaries + { + const value_type* str = s_cache_.row(l); + cmp = SV::compare_str(search_str, str, min_len); + if (cmp < 0) + return false; + + str = s_cache_.row(r); + cmp = SV::compare_str(search_str, str, min_len); + if (cmp > 0) + return false; + } + + while (l < r) + { + size_type dist = r - l; + if (dist < linear_cutoff) // do linear scan here + { + for (size_type i = l+1; i < r; ++i) + { + const value_type* str_i = s_cache_.row(i); + cmp = SV::compare_str(search_str, str_i, min_len); + if (cmp > 0) // |----i-*--|----| + { // |----*----|----| + l = i; + continue; // continue searching + } + /* + if (cmp == 0) // |----*----|----| + { + l = r = i; + return true; + } + */ + // |--*-i----|----| + BM_ASSERT(i); + r = i; + break; + } // for i + return true; + } // if linear scan + + size_type mid = (r-l) / 2 + l; + const value_type* str_m = s_cache_.row(mid); + cmp = SV::compare_str(str_m, search_str, min_len); + if (cmp <= 0) // str_m <= search_str + l = mid; + else // str_m > search_str + r = mid; + } // while + + return true; +} + +//---------------------------------------------------------------------------- + +template +typename sv_sample_index::size_type +sv_sample_index::common_prefix_length(const value_type* str_s, + size_t in_len, + size_type l, + size_type r) const BMNOEXCEPT +{ + const value_type* str_l = s_cache_.row(l); + const value_type* str_r = s_cache_.row(r); + + size_t min_len = (in_len < min_key_len_) ? 
in_len : min_key_len_; + size_type i = 0; + if (min_len >= 4) + { + for (; i < min_len-3; i+=4) + { + unsigned i2, i1; + ::memcpy(&i2, &str_l[i], sizeof(i2)); + ::memcpy(&i1, &str_r[i], sizeof(i1)); + BM_ASSERT(!bm::has_zero_byte_u64( + bm::id64_t(i2) | (bm::id64_t(i1) << 32))); + if (i1 != i2) + break; + ::memcpy(&i2, &str_s[i], sizeof(i2)); + BM_ASSERT(!bm::has_zero_byte_u64( + bm::id64_t(i2) | (bm::id64_t(i1) << 32))); + if (i1 != i2) + break; + } // for i + } + + for (; true; ++i) + { + auto ch1 = str_l[i]; auto ch2 = str_r[i]; + if (ch1 != ch2 || (!(ch1|ch2))) // chars not the same or both zero + break; + auto chs = str_s[i]; + if (ch1 != chs) + break; + } // for i + return i; +} + + +//---------------------------------------------------------------------------- + +template +void sv_sample_index::recalc_range(const value_type* search_str, + size_type& l, + size_type& r) const BMNOEXCEPT +{ + BM_ASSERT(l <= r); + BM_ASSERT(r < idx_size_); + + // -1 correction here below is done to get it to the closed interval + // [from..to] when possible, because it reduces search space + // by one scan wave + + const size_type s_step = bm::gap_max_bits / s_factor_; + if (r == idx_size_-1) // last element + { + l *= s_step; + if (l == r) + { + r = sv_size_-1; + BM_ASSERT(l <= r); + return; + } + r = sv_size_-1; + if (l > r) + l = 0; + } + else + { + if (l == r) + { + l *= s_step; + r = l + s_step-1; + if (r >= sv_size_) + r = sv_size_-1; + } + else + { + const value_type* str = s_cache_.row(r); + l *= s_step; + r *= s_step; + int cmp = SV::compare_str(search_str, str); + BM_ASSERT(cmp <= 0); + if (cmp != 0) + r -= (r && idx_unique_); // -1 correct + else + if (idx_unique_) + { + l = r; + } + } + } + BM_ASSERT(r <= sv_size_); + BM_ASSERT(l <= r); +} + +//---------------------------------------------------------------------------- } // namespace bm diff --git a/tools/tax/src/bm/bmsparsevec_compr.h b/tools/tax/src/bm/bmsparsevec_compr.h index 4b9a593e..a2d2f52c 100644 --- a/tools/tax/src/bm/bmsparsevec_compr.h +++ b/tools/tax/src/bm/bmsparsevec_compr.h @@ -744,6 +744,18 @@ class rsc_sparse_vector void calc_stat( struct rsc_sparse_vector::statistics* st) const BMNOEXCEPT; + /** + @brief Turn sparse vector into immutable mode + Read-only (immutable) vector uses less memory and allows faster searches. 
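+        For example (a minimal sketch; assumes a populated rsc_sparse_vector<>
+        instance named "csv"):
+        @code
+        csv.optimize(); // compact the memory layout first
+        csv.freeze();   // convert to the immutable (read-only) form
+        assert(csv.is_ro());
+        @endcode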
+        Before freezing (as in the sketch above) it is recommended to call optimize()
+        to get the full memory-saving effect
+        @sa optimize
+     */
+    void freeze() { sv_.freeze(); }
+
+    /** Returns true if vector is read-only */
+    bool is_ro() const BMNOEXCEPT { return sv_.is_ro_; }
+
+
     ///@}
 
@@ -944,10 +956,9 @@ class rsc_sparse_vector
     void throw_no_rsc_index();
 
 protected:
-    template<typename SVect> friend class sparse_vector_scanner;
+    template<typename SVect, unsigned S_FACTOR> friend class sparse_vector_scanner;
     template<typename SVect> friend class sparse_vector_serializer;
     template<typename SVect> friend class sparse_vector_deserializer;
-    template<typename SVect> friend class sparse_vector_scanner;
 
 private:
@@ -1423,6 +1434,7 @@ void rsc_sparse_vector<Val, SV>::sync(bool force)
     const bvector_type* bv_null = sv_.get_null_bvector();
     BM_ASSERT(bv_null);
     bv_null->build_rs_index(rs_idx_); // compute popcount prefix list
+    sv_.is_ro_ = bv_null->is_ro();
 
     if (force)
         sync_size();
diff --git a/tools/tax/src/bm/bmsparsevec_serial.h b/tools/tax/src/bm/bmsparsevec_serial.h
index 0280d92e..4cc393b8 100755
--- a/tools/tax/src/bm/bmsparsevec_serial.h
+++ b/tools/tax/src/bm/bmsparsevec_serial.h
@@ -338,6 +338,18 @@ class sparse_vector_deserializer
     sparse_vector_deserializer();
     ~sparse_vector_deserializer();
 
+    /**
+        Set deserialization finalization to force deserialized vectors into READONLY (or READWRITE) mode.
+        Performance impact: turning finalization ON makes deserialization a little slower,
+        because each bit-vector is re-converted into the new mode (READONLY).
+        Subsequent (search) operations may perform a bit faster.
+
+        @param is_final - finalization code
+        (use bm::finalization::READONLY to produce an immutable vector)
+     */
+    void set_finalization(bm::finalization is_final);
+
+
     /**
        Set external XOR reference vectors (data frame reference vectors)
@@ -351,7 +363,7 @@ class sparse_vector_deserializer
 
        @param sv - [out] target sparse vector to populate
       @param buf - input BLOB source memory pointer
-       @param clear_sv - if true clears the target vector
+       @param clear_sv - if true clears the target vector (sv)
 
        @sa deserialize_range
     */
@@ -403,8 +415,7 @@ class sparse_vector_deserializer
 
     /*!
- Load serialization descriptor, create vectors - but DO NOT perform full deserialization + Load serialization descriptor, create vectors but DO NOT perform full deserialization @param sv - [out] target sparse vector to populate @param buf - source memory pointer */ @@ -467,10 +478,11 @@ class sparse_vector_deserializer typedef bm::heap_vector rlen_vector_type; protected: - const unsigned char* remap_buf_ptr_; - alloc_type alloc_; - bm::word_t* temp_block_; - allocator_pool_type pool_; + bm::finalization is_final_ = bm::finalization::UNDEFINED; + const unsigned char* remap_buf_ptr_ = 0; + alloc_type alloc_; + bm::word_t* temp_block_ = 0; + allocator_pool_type pool_; bvector_type plane_digest_bv_; // digest of bit-planes bm::id64_t sv_size_; @@ -486,11 +498,11 @@ class sparse_vector_deserializer rlen_vector_type remap_rlen_vect_; // XOR compression variables - bv_ref_vector_type bv_ref_; ///< reference vector - bv_ref_vector_type* bv_ref_ptr_; ///< external ref + bv_ref_vector_type bv_ref_; ///< reference vector + bv_ref_vector_type* bv_ref_ptr_ = 0; ///< external ref bit-vect // Range deserialization parameters - bool idx_range_set_; + bool idx_range_set_ = false; size_type idx_range_from_; size_type idx_range_to_; }; @@ -989,8 +1001,8 @@ void sparse_vector_serializer::serialize(const SV& sv, bm::xor_sim_params xs_params; build_xor_ref_vector(sv); bvs_.set_ref_vectors(&bv_ref_); - bvs_.compute_sim_model(sim_model_, bv_ref_, xs_params); - bvs_.set_sim_model(&sim_model_); + if (bvs_.compute_sim_model(sim_model_, bv_ref_, xs_params)) + bvs_.set_sim_model(&sim_model_); } } @@ -1142,7 +1154,6 @@ void sparse_vector_serializer::serialize(const SV& sv, template sparse_vector_deserializer::sparse_vector_deserializer() - : remap_buf_ptr_(0), bv_ref_ptr_(0), idx_range_set_(false) { temp_block_ = alloc_.alloc_bit_block(); not_null_mask_bv_.set_allocator_pool(&pool_); @@ -1152,22 +1163,30 @@ sparse_vector_deserializer::sparse_vector_deserializer() // ------------------------------------------------------------------------- template -void -sparse_vector_deserializer::set_xor_ref(bv_ref_vector_type* bv_ref_ptr) +sparse_vector_deserializer::~sparse_vector_deserializer() { - bv_ref_ptr_ = bv_ref_ptr; - if (!bv_ref_ptr_) - clear_xor_compression(); + if (temp_block_) + alloc_.free_bit_block(temp_block_); } +// ------------------------------------------------------------------------- + +template +void +sparse_vector_deserializer::set_finalization(bm::finalization is_final) +{ + this->is_final_ = is_final; +} // ------------------------------------------------------------------------- template -sparse_vector_deserializer::~sparse_vector_deserializer() +void +sparse_vector_deserializer::set_xor_ref(bv_ref_vector_type* bv_ref_ptr) { - if (temp_block_) - alloc_.free_bit_block(temp_block_); + bv_ref_ptr_ = bv_ref_ptr; + if (!bv_ref_ptr_) + clear_xor_compression(); } // ------------------------------------------------------------------------- @@ -1218,19 +1237,17 @@ void sparse_vector_deserializer::deserialize_structure(SV& sv, unsigned char matr_s_ser = 0; unsigned planes = load_header(dec, sv, matr_s_ser); + if (planes == 0) + return; - // bm::id64_t sv_size = dec.get_64(); load_planes_off_table(buf, dec, planes); // read the offset vector of bit-planes - for (unsigned i = 0; i < planes; ++i) { if (!off_vect_[i]) // empty vector continue; - bvector_type* bv = sv.get_create_slice(i); BM_ASSERT(bv); (void)bv; - - } // for + } // for i } // ------------------------------------------------------------------------- @@ 
-1474,7 +1491,6 @@ void sparse_vector_deserializer::deserialize_planes( if (mask_bv) // gather mask set, use AND operation deserializer { typename bvector_type::mem_pool_guard mp_g_z(pool_, *bv); - if (bm::conditional::test() && !remap_buf_ptr_) // last plane vector (special case) { @@ -1482,14 +1498,14 @@ void sparse_vector_deserializer::deserialize_planes( deserial_.deserialize(*bv, bv_buf_ptr, temp_block_); remap_buf_ptr_ = bv_buf_ptr + read_bytes; bv->bit_and(*mask_bv, bvector_type::opt_compress); - continue; } - if (idx_range_set_) - deserial_.set_range(idx_range_from_, idx_range_to_); - - deserial_.deserialize(*bv, bv_buf_ptr); - - bv->bit_and(*mask_bv, bvector_type::opt_compress); + else + { + if (idx_range_set_) + deserial_.set_range(idx_range_from_, idx_range_to_); + deserial_.deserialize(*bv, bv_buf_ptr); + bv->bit_and(*mask_bv, bvector_type::opt_compress); + } } else { @@ -1501,21 +1517,31 @@ void sparse_vector_deserializer::deserialize_planes( remap_buf_ptr_ = bv_buf_ptr + read_bytes; if (idx_range_set_) bv->keep_range(idx_range_from_, idx_range_to_); - continue; - } - if (idx_range_set_) - { - deserial_.set_range(idx_range_from_, idx_range_to_); - deserial_.deserialize(*bv, bv_buf_ptr); - bv->keep_range(idx_range_from_, idx_range_to_); } else { - //size_t read_bytes = - deserial_.deserialize(*bv, bv_buf_ptr, temp_block_); + if (idx_range_set_) + { + deserial_.set_range(idx_range_from_, idx_range_to_); + deserial_.deserialize(*bv, bv_buf_ptr); + bv->keep_range(idx_range_from_, idx_range_to_); + } + else + { + //size_t read_bytes = + deserial_.deserialize(*bv, bv_buf_ptr, temp_block_); + } } } + switch (is_final_) + { + case bm::finalization::READONLY: + bv->freeze(); + break; + default: + break; + } } // for i deserial_.unset_range(); @@ -1580,6 +1606,16 @@ int sparse_vector_deserializer::load_null_plane(SV& sv, if (mask_bv) bv->bit_and(*mask_bv, bvector_type::opt_compress); } + + switch (is_final_) + { + case bm::finalization::READONLY: + bv->freeze(); + break; + default: + break; + } + } return planes-1; } @@ -1701,34 +1737,37 @@ void sparse_vector_deserializer::load_remap(SV& sv, raise_invalid_format(); } rmatr->resize(rows, cols, false); - rmatr->set_zero(); - - // read gamma encoded row lens - remap_rlen_vect_.resize(0); + if (rows) { - bm::bit_in bi(dec_m); - for (size_t r = 0; r < rows; ++r) - { - unsigned rl = bi.gamma(); - remap_rlen_vect_.push_back(rl); - } // for r - } + rmatr->set_zero(); - for (size_t r = 0; r < rows; ++r) - { - unsigned char* BMRESTRICT row = rmatr->row(r); - size_t cnt = remap_rlen_vect_[r]; - if (!cnt || cnt > 256) + // read gamma encoded row lens + remap_rlen_vect_.resize(0); { - raise_invalid_format(); // format corruption! + bm::bit_in bi(dec_m); + for (size_t r = 0; r < rows; ++r) + { + unsigned rl = bi.gamma(); + remap_rlen_vect_.push_back(rl); + } // for r } - for (size_t j = 0; j < cnt; ++j) + + for (size_t r = 0; r < rows; ++r) { - unsigned idx = dec_m.get_8(); - unsigned char v = dec_m.get_8(); - row[idx] = v; - } // for j - } // for r + unsigned char* BMRESTRICT row = rmatr->row(r); + size_t cnt = remap_rlen_vect_[r]; + if (!cnt || cnt > 256) + { + raise_invalid_format(); // format corruption! 
+ } + for (size_t j = 0; j < cnt; ++j) + { + unsigned idx = dec_m.get_8(); + unsigned char v = dec_m.get_8(); + row[idx] = v; + } // for j + } // for r + } } break; default: diff --git a/tools/tax/src/bm/bmsse2.h b/tools/tax/src/bm/bmsse2.h index 26164472..4e971df7 100644 --- a/tools/tax/src/bm/bmsse2.h +++ b/tools/tax/src/bm/bmsse2.h @@ -590,6 +590,85 @@ bool sse2_and_digest_5way(__m128i* BMRESTRICT dst, return z1 & z2; } +/*! + @brief AND block digest stride + @return true if stide is all zero + @ingroup SSE2 +*/ +inline +bool sse2_and_digest_3way(__m128i* BMRESTRICT dst, + const __m128i* BMRESTRICT src1, + const __m128i* BMRESTRICT src2) BMNOEXCEPT +{ + __m128i m1A, m1B, m1C, m1D; +// __m128i m1E, m1F, m1G, m1H; + + m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0)); + m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1)); + m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2)); + m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3)); +/* + m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0)); + m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1)); + m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2)); + m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3)); + + m1A = _mm_and_si128(m1A, m1E); + m1B = _mm_and_si128(m1B, m1F); + m1C = _mm_and_si128(m1C, m1G); + m1D = _mm_and_si128(m1D, m1H); +*/ + m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0)); + m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1)); + m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2)); + m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3)); + + _mm_store_si128(dst+0, m1A); + _mm_store_si128(dst+1, m1B); + _mm_store_si128(dst+2, m1C); + _mm_store_si128(dst+3, m1D); + + m1A = _mm_or_si128(m1A, m1B); + m1C = _mm_or_si128(m1C, m1D); + m1A = _mm_or_si128(m1A, m1C); + + bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF); + + m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4)); + m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5)); + m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6)); + m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7)); +/* + m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4)); + m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5)); + m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6)); + m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7)); + + m1A = _mm_and_si128(m1A, m1E); + m1B = _mm_and_si128(m1B, m1F); + m1C = _mm_and_si128(m1C, m1G); + m1D = _mm_and_si128(m1D, m1H); +*/ + m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4)); + m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5)); + m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6)); + m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7)); + + _mm_store_si128(dst+4, m1A); + _mm_store_si128(dst+5, m1B); + _mm_store_si128(dst+6, m1C); + _mm_store_si128(dst+7, m1D); + + m1A = _mm_or_si128(m1A, m1B); + m1C = _mm_or_si128(m1C, m1D); + m1A = _mm_or_si128(m1A, m1C); + + bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF); + + return z1 & z2; +} + + /*! @brief AND block digest stride @@ -740,17 +819,182 @@ bool sse2_sub_digest_2way(__m128i* BMRESTRICT dst, return z1 & z2; } +/*! 
+    @brief SUB block digest stride
+    @return true if stride is all zero
+    @ingroup SSE2
+*/
+inline
+bool sse2_sub_digest_5way(__m128i* BMRESTRICT dst,
+                          const __m128i* BMRESTRICT src1,
+                          const __m128i* BMRESTRICT src2,
+                          const __m128i* BMRESTRICT src3,
+                          const __m128i* BMRESTRICT src4) BMNOEXCEPT
+{
+    __m128i m1A, m1B, m1C, m1D;
+    __m128i m1E, m1F, m1G, m1H;
+    __m128i maskFF = _mm_set1_epi32(~0u);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
+
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
+
+    _mm_store_si128(dst+0, m1A);
+    _mm_store_si128(dst+1, m1B);
+    _mm_store_si128(dst+2, m1C);
+    _mm_store_si128(dst+3, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    const __m128i maskz = _mm_setzero_si128();
+    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
+
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
+
+    _mm_store_si128(dst+4, m1A);
+    _mm_store_si128(dst+5, m1B);
+    _mm_store_si128(dst+6, m1C);
+    _mm_store_si128(dst+7, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
+
+    return z1 & z2;
+}
+
+
+/*!
+    @brief SUB block digest stride
+    @return true if stride is all zero
+    @ingroup SSE2
+*/
+inline
+bool sse2_sub_digest_3way(__m128i* BMRESTRICT dst,
+                          const __m128i* BMRESTRICT src1,
+                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
+{
+    __m128i m1A, m1B, m1C, m1D;
+//    __m128i m1E, m1F, m1G, m1H;
+    __m128i maskFF = _mm_set1_epi32(~0u);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
+/*
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+*/
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
+
+    _mm_store_si128(dst+0, m1A);
+    _mm_store_si128(dst+1, m1B);
+    _mm_store_si128(dst+2, m1C);
+    _mm_store_si128(dst+3, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    const __m128i maskz = _mm_setzero_si128();
+    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
+/*
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+*/
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
+
+    _mm_store_si128(dst+4, m1A);
+    _mm_store_si128(dst+5, m1B);
+    _mm_store_si128(dst+6, m1C);
+    _mm_store_si128(dst+7, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
+
+    return z1 & z2;
+}
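
For reference, each sub-digest kernel above applies a logical SUB (AND NOT) to one digest
stride and reports whether that stride went all-zero. A scalar model of the 5-way variant
(a minimal sketch, not part of the patch; it assumes the 32-word stride implied by
bm::set_block_digest_wave_size, and the function name is hypothetical):

    // dst &= ~src1 & ~src2 & ~src3 & ~src4 over one 32-word stride;
    // returns true when the stride becomes all zero, so the caller
    // can clear the corresponding bit in the block digest.
    inline bool sub_digest_5way_scalar(unsigned* dst,
                                       const unsigned* s1, const unsigned* s2,
                                       const unsigned* s3, const unsigned* s4)
    {
        unsigned acc = 0;
        for (unsigned i = 0; i < 32; ++i)
        {
            dst[i] &= ~s1[i] & ~s2[i] & ~s3[i] & ~s4[i];
            acc |= dst[i]; // accumulate to detect the all-zero stride
        }
        return !acc;
    }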
+
+
 /*!
     \brief Find first non-zero bit
    @ingroup SSE2
 */
 inline
-bool sse2_bit_find_first(const __m128i* BMRESTRICT block,
+bool sse2_bit_find_first(const __m128i* BMRESTRICT block, unsigned off,
                          unsigned* pos) BMNOEXCEPT
 {
     unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
 
+    block = (const __m128i*)((bm::word_t*)(block) + off);
     const __m128i* block_end =
         (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
     const __m128i maskZ = _mm_setzero_si128();
@@ -775,7 +1019,7 @@ bool sse2_bit_find_first(const __m128i* BMRESTRICT block,
             unsigned widx = bsf >> 2; // (bsf / 4);
             unsigned w = simd_buf[widx];
             bsf = bm::bit_scan_forward32(w); // find first bit != 0
-            *pos = (simd_lane * 128) + (widx * 32) + bsf;
+            *pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf;
             return true;
         }
         unsigned mask = (_mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)));
@@ -786,7 +1030,7 @@ bool sse2_bit_find_first(const __m128i* BMRESTRICT block,
             unsigned widx = bsf >> 2; // (bsf / 4);
             unsigned w = simd_buf[widx];
             bsf = bm::bit_scan_forward32(w); // find first bit != 0
-            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
+            *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
             return true;
         }
         simd_lane+=2;
@@ -1132,7 +1376,8 @@
 SSE4.2 check for one to two (variable len) 128 bit SSE lines for gap search result
 \internal
 */
 inline
-unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
+unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
+                       const bm::gap_word_t pos, unsigned size)
 {
     BM_ASSERT(size <= 16);
     BM_ASSERT(size);
@@ -1140,13 +1385,10 @@ unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word
     const unsigned unroll_factor = 8;
     if (size < 4) // for very short vector use conventional scan
     {
-        unsigned j;
-        for (j = 0; j < size; ++j)
-        {
-            if (pbuf[j] >= pos)
-                break;
-        }
-        return j;
+        if (pbuf[0] >= pos) { size = 0; }
+        else if (pbuf[1] >= pos) { size = 1; }
+        else { size = 2; BM_ASSERT(pbuf[2] >= pos); }
+        return size;
     }
 
     __m128i m1, mz, maskF, maskFL;
@@ -1179,7 +1421,7 @@ unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word
     BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure GCC warning is indeed false
     m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
-    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
+    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
     mi = _mm_movemask_epi8(mge_mask);
     if (mi)
     {
@@ -1205,39 +1447,48 @@ unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
 {
     unsigned start = 1;
     unsigned end = 1 + ((*buf) >> 3);
-    unsigned dsize = end - start;
 
-    if (dsize < 17)
+    const unsigned arr_end = end;
+    BM_ASSERT(start != end);
+    unsigned size = end - start;
+
+    for (; size >= 64; size = end - start)
     {
-        start = bm::sse2_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
-        *is_set = ((*buf) & 1) ^ (start & 1);
-        BM_ASSERT(buf[start+1] >= pos);
-        BM_ASSERT(buf[start] < pos || (start==0));
+        unsigned mid = (start + end) >> 1;
+        if (buf[mid] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+    } // for
 
-        return start+1;
-    }
-    unsigned arr_end = end;
-    while (start != end)
+    for (; size >= 16; size = end - start)
     {
-        unsigned curr = (start + end) >> 1;
-        if (buf[curr]
< pos) - start = curr + 1; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; else - end = curr; + end = mid; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; + else + end = mid; + } // for - unsigned size = end - start; - if (size < 16) - { - size += (end != arr_end); - unsigned idx = - bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size); - start += idx; - - BM_ASSERT(buf[start] >= pos); - BM_ASSERT(buf[start - 1] < pos || (start == 1)); - break; - } - } + size += (end != arr_end); + start += bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size); + BM_ASSERT(buf[start] >= pos); + BM_ASSERT(buf[start - 1] < pos || (start == 1)); *is_set = ((*buf) & 1) ^ ((start-1) & 1); return start; @@ -1299,6 +1550,9 @@ unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \ sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4)) +#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \ + sse2_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2)) + #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \ sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2)) @@ -1323,6 +1577,12 @@ unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \ sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2)) +#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \ + sse2_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4)) + +#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \ + sse2_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2)) + #define VECT_XOR_BLOCK(dst, src) \ sse2_xor_block((__m128i*) dst, (__m128i*) (src)) @@ -1363,8 +1623,8 @@ unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) sse2_shift_r1((__m128i*)b, acc, co) -#define VECT_BIT_FIND_FIRST(src, pos) \ - sse2_bit_find_first((__m128i*) src, pos) +#define VECT_BIT_FIND_FIRST(src, off, pos) \ + sse2_bit_find_first((__m128i*) src, off, pos) #define VECT_BIT_FIND_DIFF(src1, src2, pos) \ sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos) @@ -1378,6 +1638,9 @@ unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) #define VECT_GAP_BFIND(buf, pos, is_set) \ sse2_gap_bfind(buf, pos, is_set) +#define VECT_GAP_TEST(buf, pos) \ + sse2_gap_test(buf, pos) + } // namespace diff --git a/tools/tax/src/bm/bmsse4.h b/tools/tax/src/bm/bmsse4.h index 9b1e0da1..fd70bccd 100644 --- a/tools/tax/src/bm/bmsse4.h +++ b/tools/tax/src/bm/bmsse4.h @@ -117,7 +117,44 @@ bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end) BMNOEXCE return count; } +#ifdef BM64_SSE4 + +/*! 
+    SSE4.2 optimized bitcounting, uses digest for positioning
+    @ingroup SSE4
+*/
+inline
+bm::id_t sse42_bit_count_digest(const bm::word_t* BMRESTRICT block,
+                                bm::id64_t digest) BMNOEXCEPT
+{
+    BM_ASSERT(digest);
+
+    bm::id_t count = 0;
+    bm::id64_t d = digest;
+    while (d)
+    {
+        const bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d;
+        const unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
+        const unsigned off = wave * bm::set_block_digest_wave_size;
+
+        const bm::bit_block_t::bunion_t* BMRESTRICT src_u =
                        (const bm::bit_block_t::bunion_t*)(&block[off]);
+        unsigned j = 0;
+        do
+        {
+            count +=
+                unsigned( _mm_popcnt_u64(src_u->w64[j]) +
+                          _mm_popcnt_u64(src_u->w64[j+1]) +
+                          _mm_popcnt_u64(src_u->w64[j+2]) +
+                          _mm_popcnt_u64(src_u->w64[j+3]));
+        } while ((j+=4) < bm::set_block_digest_wave_size/2);
+
+        d = bm::bmi_bslr_u64(d); // d &= d - 1;
+    } // while (d);
+    return count;
+}
+#endif
 
 /*!
     \internal
@@ -445,6 +482,66 @@ bool sse4_and_or_digest_2way(__m128i* BMRESTRICT dst,
     return z1 & z2;
 }
 
+/*!
+    @brief AND block digest stride
+    @return true if stride is all zero
+    @ingroup SSE4
+*/
+inline
+bool sse4_and_digest_3way(__m128i* BMRESTRICT dst,
+                          const __m128i* BMRESTRICT src1,
+                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
+{
+    __m128i m1A, m1B, m1C, m1D;
+
+    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
+    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
+    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
+    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
+
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
+
+    _mm_store_si128(dst+0, m1A);
+    _mm_store_si128(dst+1, m1B);
+    _mm_store_si128(dst+2, m1C);
+    _mm_store_si128(dst+3, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z1 = _mm_testz_si128(m1A, m1A);
+
+    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
+    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
+    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
+    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
+
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
+
+    _mm_store_si128(dst+4, m1A);
+    _mm_store_si128(dst+5, m1B);
+    _mm_store_si128(dst+6, m1C);
+    _mm_store_si128(dst+7, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z2 = _mm_testz_si128(m1A, m1A);
+
+    return z1 & z2;
+}
+
 
 /*!
     @brief AND block digest stride
@@ -527,6 +624,7 @@ bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
 }
 
 
+
 /*!
     @brief SUB (AND NOT) block digest stride
     *dst &= ~*src
@@ -625,6 +723,147 @@ bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
     return z1 & z2;
 }
 
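
The digest-driven bit count above only touches strides whose digest bit is set. A portable
scalar model of the same walk (an editor's sketch, not part of the patch; C++20
std::popcount/std::countr_zero stand in for the POPCNT/BMI intrinsics, and the 32-word
digest stride is assumed):

    #include <bit>
    #include <cstdint>

    inline unsigned bit_count_digest_scalar(const uint32_t* block, uint64_t digest)
    {
        unsigned count = 0;
        for (uint64_t d = digest; d; d &= d - 1)    // BLSR: clear lowest set bit
        {
            unsigned wave = std::countr_zero(d);    // index of lowest set bit
            const uint32_t* w = block + wave * 32u; // one 32-word stride per digest bit
            for (unsigned j = 0; j < 32u; ++j)
                count += (unsigned)std::popcount(w[j]);
        }
        return count;
    }

The intrinsic version computes the same wave index as popcnt(t - 1) where t = d & -d
isolates the lowest set bit.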
+/*!
+    @brief SUB block digest stride
+    @return true if stride is all zero
+    @ingroup SSE4
+*/
+inline
+bool sse4_sub_digest_5way(__m128i* BMRESTRICT dst,
+                          const __m128i* BMRESTRICT src1,
+                          const __m128i* BMRESTRICT src2,
+                          const __m128i* BMRESTRICT src3,
+                          const __m128i* BMRESTRICT src4) BMNOEXCEPT
+{
+    __m128i m1A, m1B, m1C, m1D;
+    __m128i m1E, m1F, m1G, m1H;
+    __m128i maskFF = _mm_set1_epi32(~0u);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
+
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
+
+    _mm_store_si128(dst+0, m1A);
+    _mm_store_si128(dst+1, m1B);
+    _mm_store_si128(dst+2, m1C);
+    _mm_store_si128(dst+3, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z1 = _mm_testz_si128(m1A, m1A);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
+
+    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
+    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
+    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
+    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
+
+    m1A = _mm_and_si128(m1A, m1E);
+    m1B = _mm_and_si128(m1B, m1F);
+    m1C = _mm_and_si128(m1C, m1G);
+    m1D = _mm_and_si128(m1D, m1H);
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
+
+    _mm_store_si128(dst+4, m1A);
+    _mm_store_si128(dst+5, m1B);
+    _mm_store_si128(dst+6, m1C);
+    _mm_store_si128(dst+7, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z2 = _mm_testz_si128(m1A, m1A);
+
+    return z1 & z2;
+}
+
+
+/*!
+    @brief SUB block digest stride
+    @return true if stride is all zero
+    @ingroup SSE4
+*/
+inline
+bool sse4_sub_digest_3way(__m128i* BMRESTRICT dst,
+                          const __m128i* BMRESTRICT src1,
+                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
+{
+    __m128i m1A, m1B, m1C, m1D;
+    __m128i maskFF = _mm_set1_epi32(~0u);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
+
+    _mm_store_si128(dst+0, m1A);
+    _mm_store_si128(dst+1, m1B);
+    _mm_store_si128(dst+2, m1C);
+    _mm_store_si128(dst+3, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z1 = _mm_testz_si128(m1A, m1A);
+
+    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
+    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
+    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
+    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
+
+    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
+    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
+    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
+    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
+
+    _mm_store_si128(dst+4, m1A);
+    _mm_store_si128(dst+5, m1B);
+    _mm_store_si128(dst+6, m1C);
+    _mm_store_si128(dst+7, m1D);
+
+    m1A = _mm_or_si128(m1A, m1B);
+    m1C = _mm_or_si128(m1C, m1D);
+    m1A = _mm_or_si128(m1A, m1C);
+
+    bool z2 = _mm_testz_si128(m1A, m1A);
+
+    return z1 & z2;
+}
+
+
+/*!
@@ -1033,10 +1272,12 @@ bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1, */ inline bool sse42_bit_find_first(const __m128i* BMRESTRICT block, + unsigned off, unsigned* pos) BMNOEXCEPT { unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR; + block = (const __m128i*)((const bm::word_t*)(block) + off); const __m128i* block_end = (const __m128i*)((bm::word_t*)(block) + bm::set_block_size); const __m128i maskZ = _mm_setzero_si128(); @@ -1058,7 +1299,7 @@ bool sse42_bit_find_first(const __m128i* BMRESTRICT block, unsigned widx = bsf >> 2; // (bsf / 4); unsigned w = simd_buf[widx]; bsf = BM_BSF32(w); // find first bit != 0 - *pos = (simd_lane * 128) + (widx * 32) + bsf; + *pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf; return true; } unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)); @@ -1069,7 +1310,7 @@ bool sse42_bit_find_first(const __m128i* BMRESTRICT block, unsigned widx = bsf >> 2; // (bsf / 4); unsigned w = simd_buf[widx]; bsf = BM_BSF32(w); // find first bit != 0 - *pos = ((++simd_lane) * 128) + (widx * 32) + bsf; + *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf; return true; } @@ -1100,19 +1341,8 @@ unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT { BM_ASSERT(size <= 16); - BM_ASSERT(size); - + BM_ASSERT(size >= 4); const unsigned unroll_factor = 8; - if (size < 4) // for very short vector use conventional scan - { - unsigned j; - for (j = 0; j < size; ++j) - { - if (pbuf[j] >= pos) - break; - } - return j; - } __m128i m1, mz, maskF, maskFL; @@ -1161,57 +1391,129 @@ unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf, unsigned pos, unsigned* BMRESTRICT is_set) BMNOEXCEPT { unsigned start = 1; - unsigned end = 1 + ((*buf) >> 3); - unsigned dsize = end - start; +// unsigned end = 1 + ((*buf) >> 3); + unsigned end = ((*buf) >> 3); + BM_ASSERT(buf[end] == 65535); - if (dsize < 17) +// const unsigned arr_end = end+1; + unsigned size = end - start; + for (; size >= 64; size = end - start) { - start = bm::sse4_gap_find(buf+1, (bm::gap_word_t)pos, dsize); - *is_set = ((*buf) & 1) ^ (start & 1); - BM_ASSERT(buf[start+1] >= pos); - BM_ASSERT(buf[start] < pos || (start==0)); + unsigned mid = (start + end) >> 1; + if (buf[mid] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + if (buf[mid = (start + end) >> 1] < pos) + start = mid+1; + else + end = mid; + } // for + BM_ASSERT(buf[end] >= pos); - return start+1; - } - unsigned arr_end = end; - while (start != end) + for (; size >= 16; size = end - start) { - unsigned curr = (start + end) >> 1; - if (buf[curr] < pos) - start = curr + 1; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; + else + end = mid; + if (unsigned mid = (start + end) >> 1; buf[mid] < pos) + start = mid + 1; + else + end = mid; + } // for +// size += (end != arr_end); + ++size; + if (size < 4) // for very short vector use conventional scan + { + const unsigned short* BMRESTRICT pbuf = buf + start; + if (pbuf[0] >= pos) { } + else if (pbuf[1] >= pos) { start++; } else - end = curr; - - unsigned size = end - start; - if (size < 16) { - size += (end != arr_end); - unsigned idx = - bm::sse4_gap_find(buf + start, (bm::gap_word_t)pos, size); - start += idx; - - BM_ASSERT(buf[start] >= pos); - BM_ASSERT(buf[start - 1] < pos || (start == 1)); - break; + BM_ASSERT(pbuf[2] >= pos); + 
start+=2;
+        }
+    }
-
+    else
+    {
+        start += bm::sse4_gap_find(buf+start, (bm::gap_word_t)pos, size);
+    }
     *is_set = ((*buf) & 1) ^ ((start-1) & 1);
 
     return start;
 }
 
+
 /**
-    Hybrid binary search, starts as binary, then switches to scan
+    Hybrid binary search to test a GAP value: starts as binary search, then switches to scan
+    @return test result
     @ingroup SSE4
 */
 inline
 unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) BMNOEXCEPT
 {
-    unsigned is_set;
-    bm::sse42_gap_bfind(buf, pos, &is_set);
-    return is_set;
-}
+    unsigned start = 1;
+//    unsigned end = start + ((*buf) >> 3);
+    unsigned end = ((*buf) >> 3);
+    unsigned size = end - start;
+//    const unsigned arr_end = end;
+    for (; size >= 64; size = end - start)
+    {
+        unsigned mid = (start + end) >> 1;
+        if (buf[mid] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+        if (buf[mid = (start + end) >> 1] < pos)
+            start = mid+1;
+        else
+            end = mid;
+    } // for
+    for (; size >= 16; size = end - start)
+    {
+        if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
+            start = mid+1;
+        else
+            end = mid;
+    } // for
+    //size += (end != arr_end);
+    ++size;
+    if (size < 4) // for very short vector use conventional scan
+    {
+        const unsigned short* BMRESTRICT pbuf = buf + start;
+        if (pbuf[0] >= pos) { }
+        else if (pbuf[1] >= pos) { start++; }
+        else
+        {
+            BM_ASSERT(pbuf[2] >= pos);
+            start+=2;
+        }
+    }
+    else
+    {
+        start += bm::sse4_gap_find(buf+start, (bm::gap_word_t)pos, size);
+    }
+    BM_ASSERT(buf[start] >= pos);
+    BM_ASSERT(buf[start - 1] < pos || (start == 1));
+    return ((*buf) & 1) ^ ((--start) & 1);
+}
 
 
 /**
@@ -1809,7 +2111,12 @@ void sse42_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_BITCOUNT(first, last) \
     sse4_bit_count((__m128i*) (first), (__m128i*) (last))
 
-
+/*
+#ifdef BM64_SSE4
+#define VECT_BIT_COUNT_DIGEST(src, digest) \
+    sse42_bit_count_digest(src, digest)
+#endif
+*/
 #define VECT_BITCOUNT_AND(first, last, mask) \
     sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
 
@@ -1837,6 +2144,9 @@ void sse42_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
     sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
 
+#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
+    sse4_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
+
 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
     sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
 
@@ -1861,6 +2171,12 @@ void sse42_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
     sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
 
+#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
+    sse4_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
+
+#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
+    sse4_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
+
 #define VECT_XOR_BLOCK(dst, src) \
     sse2_xor_block((__m128i*) dst, (__m128i*) (src))
 
@@ -1923,8 +2239,8 @@ void sse42_bit_block_xor_2way(bm::word_t* target_block,
     sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
 #endif
 
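
Both gap_bfind flavors in this patch use the same hybrid strategy: unrolled binary-search
steps shrink the window while it is large, and a short scan (SIMD sse4_gap_find, or a
byte-wise check for tiny windows) finishes the search. A simplified scalar model, ignoring
the is_set computation (an editor's sketch; relies on the 65535 terminator of GAP blocks):

    // Returns the index of the first GAP element >= pos.
    inline unsigned gap_bfind_scalar(const unsigned short* buf, unsigned pos)
    {
        unsigned start = 1;
        unsigned end = 1 + (buf[0] >> 3); // GAP header stores the block length
        while (end - start >= 16)         // binary phase
        {
            unsigned mid = (start + end) >> 1;
            if (buf[mid] < pos)
                start = mid + 1;
            else
                end = mid;
        }
        while (buf[start] < pos)          // linear tail, stops at 65535 at the latest
            ++start;
        return start;
    }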
-#define VECT_BIT_FIND_FIRST(src, pos) \
-    sse42_bit_find_first((__m128i*) src, pos)
+#define VECT_BIT_FIND_FIRST(src, off, pos) \
+    sse42_bit_find_first((__m128i*) src, off, pos)
 
 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
     sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
@@ -1939,6 +2255,9 @@ void sse42_bit_block_xor_2way(bm::word_t* target_block,
 #define VECT_GAP_BFIND(buf, pos, is_set) \
     sse42_gap_bfind(buf, pos, is_set)
 
+#define VECT_GAP_TEST(buf, pos) \
+    sse42_gap_test(buf, pos)
+
 #ifdef __GNUG__
 #pragma GCC diagnostic pop
 #endif
diff --git a/tools/tax/src/bm/bmsse_util.h b/tools/tax/src/bm/bmsse_util.h
index bb22ca68..6051bc05 100644
--- a/tools/tax/src/bm/bmsse_util.h
+++ b/tools/tax/src/bm/bmsse_util.h
@@ -811,7 +811,7 @@ void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
     __m128i* BMRESTRICT dst_end =
         (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);
 
-    __m128i xmm0 = _mm_set1_epi32((int)value);
+    __m128i xmm0 = _mm_set1_epi32(int(value));
     do
     {
         _mm_store_si128(dst, xmm0);
diff --git a/tools/tax/src/bm/bmstrsparsevec.h b/tools/tax/src/bm/bmstrsparsevec.h
index 0fddf611..92db824c 100644
--- a/tools/tax/src/bm/bmstrsparsevec.h
+++ b/tools/tax/src/bm/bmstrsparsevec.h
@@ -45,6 +45,12 @@ For more information please visit:  http://bitmagic.io
 namespace bm
 {
 
+enum class remap_setup
+{
+    COPY_RTABLES, //!< copy remap tables only (without data)
+};
+
+
 /*!
    \brief succinct sparse vector for strings with compression using bit-slicing ( transposition) method
@@ -358,11 +364,32 @@ class str_sparse_vector : public base_sparse_vector
             BM_ASSERT(bi.empty());
             buf_matrix_.init_resize(
                 bi.buf_matrix_.rows(), bi.buf_matrix_.cols());
-            this->flush(); sv_ = bi.sv_;
+            this->flush_impl(); sv_ = bi.sv_;
             return *this;
         }
 
         ~back_insert_iterator();
+
+        /**
+            Set optimization on load option (default: false)
+        */
+        void set_optimize(typename bvector_type::optmode opt_mode) BMNOEXCEPT
+            { opt_mode_ = opt_mode; }
+
+        /**
+            Method to configure the back inserter to collect statistics on optimal character codes.
+            This method makes the back inserter slower, but can be used to accelerate the later remap() of
+            the sparse vector. Use flush at the end to apply the remapping.
+            By default the inserter does not collect additional statistics.
+
+            Important! You should NOT use an intermediate flush if you set remapping!
+
+            @sa flush
+        */
+        void set_remap(bool flag) BMNOEXCEPT { remap_flags_ = flag; }
+
+        /// Get current remap state flags
+        unsigned get_remap() const BMNOEXCEPT { return remap_flags_; }
 
         /** push value to the vector */
         back_insert_iterator& operator=(const value_type* v)
@@ -392,8 +419,20 @@ class str_sparse_vector : public base_sparse_vector
 
         /** add a series of consequitve NULLs (no-value) to the container */
         void add_null(size_type count);
 
-        /** flush the accumulated buffer */
+        /** flush the accumulated buffer. It is important to call flush() at the end,
+            before destruction of the inserter, so that any exception thrown by the
+            flush can still be intercepted. Otherwise the inserter is flushed in the
+            destructor.
+        */
         void flush();
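
A usage sketch for the remap-aware back inserter declared above (an editor's example with
hypothetical data; it assumes the usual get_back_inserter() factory and issues no
intermediate flush() while remap statistics are being collected):

    typedef bm::str_sparse_vector<char, bm::bvector<>, 32> str_sv_type;

    str_sv_type sv;
    {
        str_sv_type::back_insert_iterator bit = sv.get_back_inserter();
        bit.set_remap(true); // collect octet statistics while inserting
        bit.set_optimize(bm::bvector<>::opt_compress);
        bit = "NM_000546";
        bit = "NM_000547";
        bit.flush(); // one final flush applies remap() using the collected stats
    }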
+
+        // access to internals
+        //
+
+        /// Get octet frequency matrix
+        /// @internal
+        const octet_freq_matrix_type& get_octet_matrix() const noexcept
+            { return omatrix_; }
+
     protected:
         /** return true if insertion buffer is empty */
         bool empty() const BMNOEXCEPT;
@@ -406,19 +445,32 @@ class str_sparse_vector : public base_sparse_vector
         */
         void add_value(const value_type* v);
 
+        /**
+            account a new value in the remap statistics
+        */
+        void add_remap_stat(const value_type* v);
+
+        void flush_impl();
+
     private:
         enum buf_size_e
         {
             n_buf_size = str_sparse_vector_type::ins_buf_size // 1024 * 8
         };
         typedef bm::dynamic_heap_matrix buffer_matrix_type;
+        friend class str_sparse_vector;
 
-    private:
+    protected:
         str_sparse_vector_type*  sv_;         ///!< pointer on the parent vector
         bvector_type*            bv_null_;    ///!< not NULL vector pointer
         buffer_matrix_type       buf_matrix_; ///!< value buffer
         size_type                pos_in_buf_; ///!< buffer position
-        block_idx_type           prev_nb_;    ///!< previous block added
+        block_idx_type           prev_nb_ = 0;///!< previous block added
+        typename
+        bvector_type::optmode    opt_mode_ = bvector_type::opt_compress;
+        ///
+        unsigned                 remap_flags_ = 0; ///< target remapping
+        octet_freq_matrix_type   omatrix_;    ///< octet frequency matrix
     };
 
@@ -447,7 +499,14 @@ class str_sparse_vector : public base_sparse_vector
     /*! copy-ctor */
     str_sparse_vector(const str_sparse_vector& str_sv);
-
+
+    /*! construct empty sparse vector, copying the remap tables from another vector
+        \param str_sv - source vector to take the remap tables from (assumed to be remapped)
+        \param remap_mode - remap table copy param
+    */
+    str_sparse_vector(const str_sparse_vector& str_sv, bm::remap_setup remap_mode);
+
+
     /*! copy assignmment operator */
     str_sparse_vector& operator = (
                 const str_sparse_vector& str_sv)
@@ -474,9 +533,7 @@ class str_sparse_vector : public base_sparse_vector
                     (str_sparse_vector&& str_sv) BMNOEXCEPT
     {
         if (this != &str_sv)
-        {
             this->swap(str_sv);
-        }
         return *this;
     }
 #endif
@@ -538,6 +595,14 @@ class str_sparse_vector : public base_sparse_vector
     */
     void insert(size_type idx, const value_type* str);
 
+    /**
+        \brief swap two vector elements between each other
+        \param idx1 - element index 1
+        \param idx2 - element index 2
+    */
+    void swap(size_type idx1, size_type idx2);
+
+
     /*!
         \brief insert STL string
@@ -708,6 +773,12 @@ class str_sparse_vector : public base_sparse_vector
     */
    int compare(size_type idx, const value_type* str) const BMNOEXCEPT;
 
+    static
+    int compare_str(const value_type* str1, const value_type* str2) BMNOEXCEPT;
+
+    static
+    int compare_str(const value_type* str1, const value_type* str2, size_t min_len) BMNOEXCEPT;
+
     /**
         \brief Compare two vector elements
@@ -722,9 +793,12 @@ class str_sparse_vector : public base_sparse_vector
 
     /**
        \brief Find size of common prefix between two vector elements in octets
+       @param prefix_buf - optional param for keeping the common prefix string (without remap decode)
       \return size of common prefix
     */
-    unsigned common_prefix_length(size_type idx1, size_type idx2) const BMNOEXCEPT;
+    template
+    unsigned common_prefix_length(size_type idx1, size_type idx2,
+                                  value_type* prefix_buf=0) const BMNOEXCEPT;
 
     /**
         Variant of compare for remapped vectors. Caller MUST guarantee vector is remapped.
@@ -841,7 +915,17 @@ class str_sparse_vector : public base_sparse_vector
                    struct str_sparse_vector::statistics* st
                    ) const BMNOEXCEPT;
-
+    /**
+        @brief Turn sparse vector into immutable mode
+        Read-only (immutable) vector uses less memory and allows faster searches.
+        Before freezing it is recommended to call optimize() to get the full memory saving effect
+        @sa optimize, remap
+    */
+    void freeze() { this->freeze_matr(); }
+
+    /** Returns true if vector is read-only */
+    bool is_ro() const BMNOEXCEPT { return this->is_ro_; }
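
The intended lifecycle, as a minimal sketch (method names as declared above; remap() is
optional and freeze() must come last, after all modifications):

    sv.remap();    // optional: re-code octets for tighter bit-slices
    sv.optimize(); // recommended before freeze for the full memory effect
    sv.freeze();   // the vector becomes immutable
    BM_ASSERT(sv.is_ro());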
+
     ///@}
 
     // ------------------------------------------------------------
@@ -909,8 +993,11 @@ class str_sparse_vector : public base_sparse_vector
        should not be modified (should be read-only).
 
        \param str_sv - source sparse vector (assumed it is not remapped)
+       \param omatrix - pointer to externally computed char frequency matrix (optional)
+       \sa remap, freeze
     */
-    void remap_from(const str_sparse_vector& str_sv);
+    void remap_from(const str_sparse_vector& str_sv,
+                    octet_freq_matrix_type* omatrix = 0);
 
     /**
         Build remapping profile and re-load content to save memory
@@ -943,7 +1030,22 @@ class str_sparse_vector : public base_sparse_vector
                      const value_type* BMRESTRICT str,
                      const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2
                      ) BMNOEXCEPT;
-
+    /*!
+        remap string from external (ASCII) system to matrix internal code
+        also creates a zero terminated copy string
+        @return true if remapping was ok, false if found incorrect value
+                for the plane
+        @internal
+    */
+    static
+    bool remap_n_tosv_2way(
+                   value_type*   BMRESTRICT     sv_str,
+                   value_type*   BMRESTRICT     str_cp,
+                   size_type                    buf_size,
+                   const value_type* BMRESTRICT str,
+                   size_t                       in_len,
+                   const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT;
+
     /*!
         remap string from external (ASCII) system to matrix internal code
         @internal
@@ -954,6 +1056,22 @@ class str_sparse_vector : public base_sparse_vector
     {
         return remap_tosv(sv_str, buf_size, str, remap_matrix2_);
     }
+
+    /*!
+        remap string from external (ASCII) system to matrix internal code
+        @internal
+    */
+    bool remap_n_tosv_2way(
+                   value_type*   BMRESTRICT     sv_str,
+                   value_type*   BMRESTRICT     str_cp,
+                   size_type                    buf_size,
+                   const value_type* BMRESTRICT str,
+                   size_t                       in_len) const BMNOEXCEPT
+    {
+        return remap_n_tosv_2way(
+                    sv_str, str_cp, buf_size, str, in_len, remap_matrix2_);
+    }
+
     /*!
         remap string from internal code to external (ASCII) system
         @return true if remapping was ok, false if found incorrect value
@@ -1029,42 +1147,50 @@ class str_sparse_vector : public base_sparse_vector
     struct sv_decode_visitor_func
     {
         sv_decode_visitor_func(CharMatrix& cmatr) BMNOEXCEPT2
-            : cmatr_(cmatr), mask_(0), sv_off_(0)
+            : cmatr_(cmatr)
         {}
 
         int add_bits(size_type bv_offset,
-                     const unsigned char* BMRESTRICT bits, unsigned bits_size) BMNOEXCEPT
+                     const unsigned char* BMRESTRICT bits,
+                     unsigned bits_size) BMNOEXCEPT
         {
+            BM_ASSERT(bits_size);
+
            // can be negative (-1) when bv base offset = 0 and sv = 1,2..
size_type base = bv_offset - sv_off_;
-            unsigned_value_type m = mask_;
+            const unsigned_value_type m = mask_;
             const unsigned i = substr_i_;
-            for (unsigned j = 0; j < bits_size; ++j)
+            unsigned j = 0;
+            do
             {
                 size_type idx = bits[j] + base;
                 value_type* BMRESTRICT str = cmatr_.row(idx);
                 str[i] |= m;
-            } // for i
+            } while (++j < bits_size);
             return 0;
         }
+
         int add_range(size_type bv_offset, size_type sz) BMNOEXCEPT
         {
+            BM_ASSERT(sz);
+
             auto base = bv_offset - sv_off_;
-            unsigned_value_type m = mask_;
+            const unsigned_value_type m = mask_;
             const unsigned i = substr_i_;
-            for (size_type j = 0; j < sz; ++j)
+            size_type j = 0;
+            do
             {
                 size_type idx = j + base;
                 value_type* BMRESTRICT str = cmatr_.row(idx);
                 str[i] |= m;
-            } // for i
+            } while(++j < sz);
             return 0;
         }
 
-        CharMatrix&         cmatr_;    ///< target array for reverse transpose
-        unsigned_value_type mask_;     ///< bit-plane mask
-        unsigned            substr_i_; ///< i
-        size_type           sv_off_;   ///< SV read offset
+        CharMatrix&         cmatr_;        ///< target array for reverse transpose
+        unsigned_value_type mask_ = 0;     ///< bit-plane mask
+        unsigned            substr_i_= 0;  ///< i
+        size_type           sv_off_ = 0;   ///< SV read offset
     };
 
@@ -1238,7 +1364,7 @@ class str_sparse_vector : public base_sparse_vector
 protected:
     enum insert_buf_size_e
     {
-        ins_buf_size = 1024 * 8
+        ins_buf_size = bm::gap_max_bits // 1024 * 8
     };
 
     /// @internal
@@ -1378,7 +1504,7 @@ class str_sparse_vector : public base_sparse_vector
             {
                 bvector_type* bv =
                     this->get_create_slice((unsigned)(char_slice_idx * 8) + bi);
-                bv->import_sorted(&bit_list[0], n_bits);
+                bv->import_sorted(&bit_list[0], n_bits, false);
             }
         } // for ch_acc
     }
@@ -1444,6 +1570,18 @@ class str_sparse_vector : public base_sparse_vector
     remap_matrix_type* get_remap_matrix()
         { return &remap_matrix1_; }
 
+    /**
+        remap using the statistics table from the inserter
+    */
+    void remap(back_insert_iterator& iit);
+
+    /**
+        Remap from implementation, please note that the move_data flag can violate const-ness
+    */
+    void remap_from_impl(const str_sparse_vector& str_sv,
+                         octet_freq_matrix_type* omatrix,
+                         bool move_data);
+
 protected:
     template friend class sparse_vector_serializer;
     template friend class sparse_vector_deserializer;
@@ -1488,6 +1626,23 @@ str_sparse_vector::str_sparse_vector(
 
 //---------------------------------------------------------------------
 
+template
+str_sparse_vector::str_sparse_vector(
+    const str_sparse_vector& str_sv, bm::remap_setup remap_mode)
+: parent_type(str_sv.get_null_support()),
+  remap_flags_(str_sv.remap_flags_),
+  remap_matrix1_(str_sv.remap_matrix1_),
+  remap_matrix2_(str_sv.remap_matrix2_)
+{
+    BM_ASSERT(str_sv.remap_flags_); // source vector should be remapped
+    BM_ASSERT(remap_mode == bm::remap_setup::COPY_RTABLES);
+    static_assert(STR_SIZE > 1,
+        "BM:: String vector size must be > 1 (to accommodate 0 terminator)");
+    (void) remap_mode;
+}
+
+//---------------------------------------------------------------------
+
 template
 void str_sparse_vector::swap(
                 str_sparse_vector& str_sv) BMNOEXCEPT
@@ -1527,6 +1682,18 @@ void str_sparse_vector::insert(
 
 //---------------------------------------------------------------------
 
+template
+void str_sparse_vector::swap(size_type idx1,
+                             size_type idx2)
+{
+    BM_ASSERT(idx1 < this->size());
+    BM_ASSERT(idx2 < this->size());
+
+    this->swap_elements(idx1, idx2);
+}
+
+//---------------------------------------------------------------------
+
 template
 void str_sparse_vector::erase(size_type idx)
 {
@@ -1715,6 +1882,106 @@ void str_sparse_vector::calc_stat(
     }
 }
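
The bm::remap_setup::COPY_RTABLES constructor implemented above builds an empty vector that
reuses the source's remap (dictionary) tables, so octet codes remain directly comparable
between the two vectors. A sketch (an editor's example; assumes sv_src is already remapped):

    typedef bm::str_sparse_vector<char, bm::bvector<>, 32> str_sv_type;

    str_sv_type sv_part(sv_src, bm::remap_setup::COPY_RTABLES);
    // sv_part starts empty but shares sv_src's character remap tables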
+//--------------------------------------------------------------------- + +template +int str_sparse_vector::compare_str( + const value_type* str1, const value_type* str2) BMNOEXCEPT +{ + BM_ASSERT(str1 && str2); + int res = 0; + for (unsigned i = 0; true; ++i) + { + CharType octet2 = str2[i]; + CharType octet1 = str1[i]; + if (!octet1) + { + res = -octet2; // -1 || 0 + break; + } + res = (octet1 > octet2) - (octet1 < octet2); + if (res || !octet2) + break; + } // for i + return res; +} + +//--------------------------------------------------------------------- + +template +int str_sparse_vector::compare_str( + const value_type* str1, const value_type* str2, size_t min_len) BMNOEXCEPT +{ + BM_ASSERT(str1 && str2); + + int res = 0; + size_t i = 0; + + CharType octet2, octet1; + if (min_len >= 4) + { + for (; i < min_len-3; i+=4) + { + unsigned i2, i1; + ::memcpy(&i2, &str2[i], sizeof(i2)); + ::memcpy(&i1, &str1[i], sizeof(i1)); + BM_ASSERT(!bm::has_zero_byte_u64(bm::id64_t(i2) | (bm::id64_t(i1) << 32))); + if (i1 != i2) + { + octet2 = str2[i]; + octet1 = str1[i]; + res = (octet1 > octet2) - (octet1 < octet2); + if (res) + return res; + octet2 = str2[i+1]; + octet1 = str1[i+1]; + res = (octet1 > octet2) - (octet1 < octet2); + if (res) + return res; + octet2 = str2[i+2]; + octet1 = str1[i+2]; + res = (octet1 > octet2) - (octet1 < octet2); + if (res) + return res; + octet2 = str2[i+3]; + octet1 = str1[i+3]; + res = (octet1 > octet2) - (octet1 < octet2); + if (res) + return res; + BM_ASSERT(0); + break; + } + } // for i + } + + + for (; i < min_len; ++i) + { + octet2 = str2[i]; + octet1 = str1[i]; + BM_ASSERT(octet1 && octet2); + res = (octet1 > octet2) - (octet1 < octet2); + if (res) + return res; + } // for i + + for (; true; ++i) + { + octet2 = str2[i]; + octet1 = str1[i]; + if (!octet1) + { + res = -octet2; // -1 || 0 + break; + } + res = (octet1 > octet2) - (octet1 < octet2); + if (res || !octet2) + break; + } // for i + return res; +} + + //--------------------------------------------------------------------- template @@ -1735,7 +2002,7 @@ int str_sparse_vector::compare_remap( break; } const unsigned char* remap_row = remap_matrix1_.row(i); - unsigned char remap_value1 = remap_row[unsigned(octet1)]; + CharType remap_value1 = (CharType)remap_row[unsigned(octet1)]; BM_ASSERT(remap_value1); res = (remap_value1 > octet2) - (remap_value1 < octet2); if (res || !octet2) @@ -1791,6 +2058,7 @@ int str_sparse_vector::compare( size_type idx1, size_type idx2) const BMNOEXCEPT { + BM_ASSERT(idx1 < size() && idx2 < size()); int res = 0; if (idx1 == idx2) return 0; @@ -1808,9 +2076,9 @@ int str_sparse_vector::compare( } const unsigned char* remap_row = remap_matrix1_.row(i); unsigned char remap_value1 = remap_row[unsigned(octet1)]; - BM_ASSERT(remap_value1); + //BM_ASSERT(remap_value1); unsigned char remap_value2 = remap_row[unsigned(octet2)]; - BM_ASSERT(remap_value2); + //BM_ASSERT(remap_value2); res = (remap_value1 > remap_value2) - (remap_value1 < remap_value2); if (res || !octet2) break; @@ -1839,27 +2107,36 @@ int str_sparse_vector::compare( //--------------------------------------------------------------------- template +template unsigned str_sparse_vector::common_prefix_length( - size_type idx1, size_type idx2) const BMNOEXCEPT + size_type idx1, size_type idx2, + value_type* prefix_buf) const BMNOEXCEPT { + BM_ASSERT (!(prefix_buf && !USE_PREFIX_BUF)); unsigned i = 0; - for (; true; ++i) + CharType ch1 = CharType(this->bmatr_.get_octet(idx1, i)); + CharType ch2 = 
CharType(this->bmatr_.get_octet(idx2, i)); + if (ch1 == ch2 && (ch1|ch2)) { - CharType ch1 = CharType(this->bmatr_.get_octet(idx1, i)); - CharType ch2 = CharType(this->bmatr_.get_octet(idx2, i)); - if (!ch1 || !ch2) + if constexpr(USE_PREFIX_BUF) { - if (i) - --i; - break; + BM_ASSERT(prefix_buf); + *prefix_buf++ = ch1; } - if (ch1 != ch2) - break; - } // for - + for (++i; true; ++i) + { + ch1 = CharType(this->bmatr_.get_octet(idx1, i)); + ch2 = CharType(this->bmatr_.get_octet(idx2, i)); + if (ch1 != ch2 || (!(ch1|ch2))) // chs not the same or both zero + return i; + if constexpr(USE_PREFIX_BUF) + *prefix_buf++ = ch1; + } // for i + } return i; } + //--------------------------------------------------------------------- template @@ -1892,7 +2169,7 @@ void str_sparse_vector::calc_octet_stat( size_type max_str_len = effective_max_str(); octet_matrix.resize(max_str_len, 256, false); octet_matrix.set_zero(); //init(true); - + { const_iterator it(this); for(; it.valid(); ++it) { @@ -1910,7 +2187,7 @@ void str_sparse_vector::calc_octet_stat( row[ch_idx] += 1; } // for i } // for it - + } } //--------------------------------------------------------------------- @@ -1962,25 +2239,27 @@ void str_sparse_vector::recalc_remap_matrix2() { BM_ASSERT(remap_flags_); - remap_matrix2_.resize(remap_matrix1_.rows(), remap_matrix1_.cols(), false); - remap_matrix2_.set_zero(); - - //remap_matrix2_.init(true); - - for (unsigned i = 0; i < remap_matrix1_.rows(); ++i) + auto rows = remap_matrix1_.rows(); + remap_matrix2_.resize(rows, remap_matrix1_.cols(), false); + if (rows) { - const unsigned char* remap_row1 = remap_matrix1_.row(i); - unsigned char* remap_row2 = remap_matrix2_.row(i); - for (unsigned j = 1; j < remap_matrix1_.cols(); ++j) + remap_matrix2_.set_zero(); + //remap_matrix2_.init(true); + for (unsigned i = 0; i < remap_matrix1_.rows(); ++i) { - if (remap_row1[j]) + const unsigned char* remap_row1 = remap_matrix1_.row(i); + unsigned char* remap_row2 = remap_matrix2_.row(i); + for (unsigned j = 1; j < remap_matrix1_.cols(); ++j) { - unsigned ch_code = remap_row1[j]; - remap_row2[ch_code] = (unsigned char)j; - BM_ASSERT(ch_code < 256); - } - } // for j - } // for i + if (remap_row1[j]) + { + unsigned ch_code = remap_row1[j]; + remap_row2[ch_code] = (unsigned char)j; + BM_ASSERT(ch_code < 256); + } + } // for j + } // for i + } // if rows } //--------------------------------------------------------------------- @@ -1992,7 +2271,8 @@ bool str_sparse_vector::remap_tosv( const value_type* BMRESTRICT str, const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT { - for (unsigned i = 0; i < buf_size; ++i) + const unsigned char* remap_row = octet_remap_matrix2.row(0); + for (unsigned i = 0; i < buf_size; ++i, remap_row += 256) { CharType ch = str[i]; if (!ch) @@ -2000,15 +2280,44 @@ bool str_sparse_vector::remap_tosv( sv_str[i] = ch; break; } - const unsigned char* remap_row = octet_remap_matrix2.row(i); +// const unsigned char* remap_row = octet_remap_matrix2.row(i); unsigned char remap_value = remap_row[unsigned(ch)]; + sv_str[i] = CharType(remap_value); if (!remap_value) // unknown dictionary element return false; + } // for i + return true; +} + +//--------------------------------------------------------------------- + +template +bool str_sparse_vector::remap_n_tosv_2way( + value_type* BMRESTRICT sv_str, + value_type* BMRESTRICT str_cp, + size_type buf_size, + const value_type* BMRESTRICT str, + size_t in_len, + const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT +{ + 
BM_ASSERT(in_len <= buf_size); (void) buf_size;
+
+    const unsigned char* remap_row = octet_remap_matrix2.row(0);
+    for (unsigned i = 0; i < in_len; ++i, remap_row += 256)
+    {
+        CharType ch = str[i];
+        str_cp[i] = value_type(ch);
+        BM_ASSERT(ch);
+        unsigned char remap_value = remap_row[unsigned(ch)];
+        sv_str[i] = CharType(remap_value);
+        if (!remap_value) // unknown dictionary element
+            return false;
+    } // for i
+    sv_str[in_len] = str_cp[in_len] = 0; // guarantee zero termination
+    return true;
+}
+
 
 //---------------------------------------------------------------------
 
 template
@@ -2019,7 +2328,8 @@ bool str_sparse_vector::remap_fromsv(
          const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix1
          ) BMNOEXCEPT
 {
-    for (unsigned i = 0; i < buf_size; ++i)
+    const unsigned char* remap_row = octet_remap_matrix1.row(0);
+    for (unsigned i = 0; i < buf_size; ++i, remap_row += 256)
     {
         CharType ch = sv_str[i];
         if (!ch)
@@ -2027,11 +2337,10 @@ bool str_sparse_vector::remap_fromsv(
             str[i] = ch;
             break;
         }
-        const unsigned char* remap_row = octet_remap_matrix1.row(i);
         unsigned char remap_value = remap_row[unsigned(ch)];
+        str[i] = CharType(remap_value);
         if (!remap_value) // unknown dictionary element
             return false;
-        str[i] = CharType(remap_value);
     } // for i
     return true;
 }
@@ -2041,57 +2350,116 @@ bool str_sparse_vector::remap_fromsv(
 
 template
 void str_sparse_vector::remap()
 {
-    // TODO: get rid of tmp, implement a move-remapping
     str_sparse_vector sv_tmp(this->get_null_support());
-    sv_tmp.remap_from(*this);
+    sv_tmp.remap_from_impl(*this, 0, true /*move data*/);
     sv_tmp.swap(*this);
 }
 
 //---------------------------------------------------------------------
 
+template
+void str_sparse_vector::remap(
+                                        back_insert_iterator& iit)
+{
+    if (iit.remap_flags_ && iit.omatrix_.rows())
+    {
+        str_sparse_vector
+                sv_tmp(this->get_null_support());
+        sv_tmp.remap_from_impl(*this, &iit.omatrix_, true /*move data*/);
+        sv_tmp.swap(*this);
+    }
+    else
+        remap();
+}
+
+//---------------------------------------------------------------------
+
 template
 void str_sparse_vector::remap_from(
-                                const str_sparse_vector& str_sv)
+                                const str_sparse_vector& str_sv,
+                                octet_freq_matrix_type* omatrix)
+{
+    remap_from_impl(str_sv, omatrix, false);
+}
+
+//---------------------------------------------------------------------
+
+template
+void
+str_sparse_vector::remap_from_impl(
+                                const str_sparse_vector& str_sv,
+                                octet_freq_matrix_type* omatrix,
+                                bool move_data)
 {
+    const unsigned buffer_size = ins_buf_size; // bm::gap_max_bits; // 65536;
+
     if (str_sv.is_remap())
     {
         *this = str_sv;
         return;
     }
+
+    typename bvector_type::allocator_pool_type pool;
+    typename
+    bm::alloc_pool_guard g1, g2;
+    if (move_data)
+    {
+        str_sparse_vector& sv = const_cast(str_sv);
+        g1.assign_if_not_set(pool, *this);
+        g2.assign_if_not_set(pool, sv);
+
+        auto r = sv.get_bmatrix().rows();
+        pool.set_block_limit(r + 10);
+    }
+
     this->clear_all(true);
 
     if (str_sv.empty()) // no content to remap
         return;
 
-    octet_freq_matrix_type omatrix; // occupancy map
-    str_sv.calc_octet_stat(omatrix);
-
-    str_sv.build_octet_remap(remap_matrix1_, remap_matrix2_, omatrix);
+    octet_freq_matrix_type occ_matrix; // occupancy map
+    if (!omatrix)
+    {
+        str_sv.calc_octet_stat(occ_matrix);
+        omatrix = &occ_matrix;
+    }
+    str_sv.build_octet_remap(remap_matrix1_, remap_matrix2_, *omatrix);
     remap_flags_ = 1; // turn ON remapped mode
-
-    const unsigned buffer_size = ins_buf_size; // 1024 * 8;
 
     typedef bm::dynamic_heap_matrix buffer_matrix_type;
 
     size_type str_len = str_sv.effective_max_str()+1;
-    //remap_buffer_type cmatr(true);
    buffer_matrix_type cmatr(buffer_size, str_len);
     cmatr.init(true); // init and set zero
 
-    for (size_type i = 0; true; )
+    for (size_type i{0}, dsize; true; i += dsize)
     {
-        size_type dsize = str_sv.decode(cmatr, i, buffer_size, true);
+        dsize = str_sv.decode(cmatr, i, buffer_size, true);
         if (!dsize)
             break;
+        if (move_data && (dsize == ins_buf_size)) // free the src.vect blocks
+        {
+            // here const_cast is OK, because we violate const-ness only
+            // in internal safe cases controlled by the upper level call
+            //
+            str_sparse_vector& sv = const_cast(str_sv);
+            sv.clear_range(i, i+dsize-1, false);
+        }
+
         this->import(cmatr, i, dsize);
-        i += dsize;
     } // for i
 
     if (bvector_type* bv_null = this->get_null_bvect())
     {
         if (const bvector_type* bv_null_arg = str_sv.get_null_bvector())
+            if (move_data)
+            {
+                bvector_type* bv = const_cast(bv_null_arg);
+                bv_null->swap(*bv);
+            }
+            else
             *bv_null = *bv_null_arg;
         else
         {
             // TODO: exception? assert? maybe it is OK...
@@ -2107,6 +2475,7 @@ void str_sparse_vector::sync(bool /*force*/)
 {
     if (remap_flags_)
         recalc_remap_matrix2();
+    this->sync_ro();
 }
 
 //---------------------------------------------------------------------
@@ -2420,7 +2789,7 @@ str_sparse_vector::const_iterator::advance() BMNOEXCEPT
 
 template
 str_sparse_vector::back_insert_iterator::back_insert_iterator() BMNOEXCEPT
-: sv_(0), bv_null_(0), pos_in_buf_(~size_type(0)), prev_nb_(0)
+: sv_(0), bv_null_(0), pos_in_buf_(~size_type(0))
 {}
 
 //---------------------------------------------------------------------
@@ -2451,7 +2820,8 @@ template
 str_sparse_vector::back_insert_iterator::back_insert_iterator(
 const str_sparse_vector::back_insert_iterator& bi) BMNOEXCEPT
 : sv_(bi.sv_), bv_null_(bi.bv_null_),
   buf_matrix_(bi.buf_matrix_.rows(), bi.buf_matrix_.cols()),
-  pos_in_buf_(~size_type(0)), prev_nb_(bi.prev_nb_)
+  pos_in_buf_(~size_type(0)), prev_nb_(bi.prev_nb_), opt_mode_(bi.opt_mode_),
+  remap_flags_(bi.remap_flags_), omatrix_(bi.omatrix_)
 {
     BM_ASSERT(bi.empty());
 }
@@ -2478,6 +2848,20 @@ str_sparse_vector::back_insert_iterator::empty()
 
 template
 void str_sparse_vector::back_insert_iterator::flush()
+{
+    flush_impl();
+    if (remap_flags_)
+    {
+        buf_matrix_.free();
+        sv_->remap(*this);
+        remap_flags_ = 0;
+    }
+}
+
+//---------------------------------------------------------------------
+
+template
+void str_sparse_vector::back_insert_iterator::flush_impl()
 {
     if (this->empty())
         return;
@@ -2488,12 +2872,12 @@ void str_sparse_vector::back_insert_iterator::flush()
     block_idx_type nb = sv_->size() >> bm::set_block_shift;
     if (nb != prev_nb_)
     {
-        // optimize all previous blocks in all planes
-        sv_->optimize_block(prev_nb_);
+        sv_->optimize_block(prev_nb_, opt_mode_);
         prev_nb_ = nb;
     }
 }
 
+
 //---------------------------------------------------------------------
 
 template
@@ -2533,6 +2917,47 @@ typename str_sparse_vector::back_insert_iterator::size_t
 
 //---------------------------------------------------------------------
 
+
+template
+void
+str_sparse_vector::back_insert_iterator::add_remap_stat(
+const str_sparse_vector::back_insert_iterator::value_type* v)
+{
+    BM_ASSERT(remap_flags_);
+
+    size_t slen = ::strlen(v);
+
+    auto orows = omatrix_.rows();
+    if (slen > orows)
+    {
+        if (!orows)
+        {
+            omatrix_.resize(slen, 256, false);
+            omatrix_.set_zero();
+        }
+        else
+        {
+            omatrix_.resize(slen, 256, true);
+            for (; orows < omatrix_.rows(); ++orows)
+            {
+                typename
+                octet_freq_matrix_type::value_type* r = omatrix_.row(orows);
+                ::memset(r, 0, 256 * sizeof(r[0]));
+            } // for orows
+        }
+    }
+    for (size_t i = 0; i < slen; ++i)
+    {
+        value_type ch = v[i];
+        typename
+        octet_freq_matrix_type::value_type* row = omatrix_.row(i);
+        unsigned ch_idx = (unsigned char)ch;
+        row[ch_idx] += 1;
+    } // for i
+}
+
+//---------------------------------------------------------------------
+
 template
 void
 str_sparse_vector::back_insert_iterator::add_value(
@@ -2547,7 +2972,7 @@ const str_sparse_vector::back_insert_iterator::value_typ
         if (pos_in_buf_ == ~size_type(0) && (!buf_matrix_.is_init()))
             buf_matrix_.init();
         else
-            this->flush();
+            this->flush_impl();
         pos_in_buf_ = 0; buf_matrix_.set_zero();
     }
     else
@@ -2555,6 +2980,9 @@ const str_sparse_vector::back_insert_iterator::value_typ
         ++pos_in_buf_;
     }
 
+    if (remap_flags_)
+        add_remap_stat(v);
+
     value_type* r = buf_matrix_.row(pos_in_buf_);
 
     typename buffer_matrix_type::size_type i;
diff --git a/tools/tax/src/bm/bmundef.h b/tools/tax/src/bm/bmundef.h
index ee3e4f9e..07fa1de2 100644
--- a/tools/tax/src/bm/bmundef.h
+++ b/tools/tax/src/bm/bmundef.h
@@ -47,7 +47,10 @@ For more information please visit:  http://bitmagic.io
 #undef BMVECTOPT
 #undef VECT_XOR_ARR_2_MASK
 #undef VECT_ANDNOT_ARR_2_MASK
+
 #undef VECT_BITCOUNT
+#undef VECT_BIT_COUNT_DIGEST
+
 #undef VECT_BITCOUNT_AND
 #undef VECT_BITCOUNT_OR
 #undef VECT_BITCOUNT_XOR
@@ -83,14 +86,23 @@ For more information please visit:  http://bitmagic.io
 #undef VECT_AND_DIGEST_2WAY
 #undef VECT_AND_OR_DIGEST_2WAY
 #undef VECT_AND_DIGEST_5WAY
+#undef VECT_AND_DIGEST_3WAY
 #undef VECT_BLOCK_SET_DIGEST
 
+#undef VECT_SUB_DIGEST
+#undef VECT_SUB_DIGEST_5WAY
+#undef VECT_SUB_DIGEST_2WAY
+#undef VECT_SUB_BLOCK
+
 #undef VECT_BLOCK_XOR_CHANGE
 #undef VECT_BIT_BLOCK_XOR
 #undef VECT_BIT_FIND_FIRST
 #undef VECT_BIT_FIND_DIFF
+#undef VECT_BIT_FIND_FIRST_IF_1
+
 #undef VECT_GAP_BFIND
+#undef VECT_GAP_TEST
 
 #undef BMI1_SELECT64
 #undef BMI2_SELECT64
@@ -104,4 +116,5 @@ For more information please visit:  http://bitmagic.io
 #undef BM_UNALIGNED_ACCESS_OK
 #undef BM_x86
+#undef BM_ALLOC_ALIGN
diff --git a/tools/tax/src/bm/bmutil.h b/tools/tax/src/bm/bmutil.h
index 8be7eef8..8d438ca9 100644
--- a/tools/tax/src/bm/bmutil.h
+++ b/tools/tax/src/bm/bmutil.h
@@ -22,6 +22,7 @@ For more information please visit:  http://bitmagic.io
   \brief Bit manipulation primitives (internal)
 */
 
+
 #include "bmdef.h"
 #include "bmconst.h"
 
@@ -30,11 +31,8 @@ For more information please visit:  http://bitmagic.io
 #else
     #if defined(_M_AMD64) || defined(_M_X64)
     #include
-    #elif defined(BMSSE2OPT) || defined(BMSSE42OPT)
-    #include
-    #elif defined(BMAVX2OPT)
-    #include
-    #include
+    #elif defined(__x86_64__)
+    #include
     #endif
 #endif
 
@@ -555,7 +553,7 @@ BMFORCEINLINE void xor_swap(W& x, W& y) BMNOEXCEPT
     @internal
*/
 inline
-unsigned compute_h64_mask(unsigned long long w)
+unsigned compute_h64_mask(unsigned long long w) BMNOEXCEPT
 {
     unsigned h_mask = 0;
     for (unsigned i = 0; w && (i < 8); ++i, w >>= 8)
@@ -566,6 +564,15 @@ unsigned compute_h64_mask(unsigned long long w)
     return h_mask;
 }
 
+/**
+    Returns true if the 64-bit word contains a zero octet
+ */
+BMFORCEINLINE
+bool has_zero_byte_u64(bm::id64_t v) BMNOEXCEPT
+{
+    return (v - 0x0101010101010101ULL) & ~(v) & 0x8080808080808080ULL;
+}
+
 /*!
Returns bit count @@ -622,6 +629,20 @@ unsigned word_bitcount64(bm::id64_t x) BMNOEXCEPT #endif } +/** + Check pointer alignment + @internal + */ +template< typename T > +bool is_aligned(T* p) BMNOEXCEPT +{ +#if defined (BM_ALLOC_ALIGN) + return !(reinterpret_cast(p) % BM_ALLOC_ALIGN); +#else + (void)p; + return true; +#endif +} } // bm diff --git a/tools/tax/src/bm/bmxor.h b/tools/tax/src/bm/bmxor.h index 2e525a2e..8c70b48e 100644 --- a/tools/tax/src/bm/bmxor.h +++ b/tools/tax/src/bm/bmxor.h @@ -727,11 +727,8 @@ class bv_ref_vector { size_type rows = bmatr.rows(); for (size_type r = 0; r < rows; ++r) - { - bvector_type_const_ptr bv = bmatr.get_row(r); - if (bv) + if (bvector_type_const_ptr bv = bmatr.get_row(r)) add(bv, rows_acc_ + r); - } // for r rows_acc_ += unsigned(rows); } @@ -757,13 +754,17 @@ class bv_ref_vector /** Calculate blocks digest and resize XOR distance matrix based on total number of available blocks + @return true if created ok (false if no blocks found) */ - void build_nb_digest_and_xor_matrix(matrix_chain_type& matr, + bool build_nb_digest_and_xor_matrix(matrix_chain_type& matr, bvector_type& bv_blocks) const { fill_alloc_digest(bv_blocks); size_type cnt = bv_blocks.count(); + if (!cnt) + return false; resize_xor_matrix(matr, cnt); + return true; } protected: @@ -892,16 +893,18 @@ class xor_scanner /** Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors + @return true if computed successfully */ - void compute_sim_model(xor_sim_model& sim_model, + bool compute_sim_model(xor_sim_model& sim_model, const bv_ref_vector_type& ref_vect, const bm::xor_sim_params& params); /** Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors + @return true if computed successfully */ - void compute_sim_model(xor_sim_model &sim_model, + bool compute_sim_model(xor_sim_model &sim_model, const bm::xor_sim_params& params); /** @@ -1434,27 +1437,30 @@ typename xor_scanner::size_type xor_scanner::refine_match_chain() // -------------------------------------------------------------------------- template -void xor_scanner::compute_sim_model(xor_sim_model &sim_model, +bool xor_scanner::compute_sim_model(xor_sim_model &sim_model, const bv_ref_vector_type& ref_vect, const bm::xor_sim_params& params) { const bv_ref_vector_type* ref_vect_curr = this->ref_vect_; // save ref-vect ref_vect_ = &ref_vect; - compute_sim_model(sim_model, params); + bool sim_ok = compute_sim_model(sim_model, params); ref_vect_ = ref_vect_curr; // restore state + return sim_ok; } template -void xor_scanner::compute_sim_model(bm::xor_sim_model& sim_model, +bool xor_scanner::compute_sim_model(bm::xor_sim_model& sim_model, const bm::xor_sim_params& params) { BM_ASSERT(ref_vect_); sim_model.bv_blocks.clear(false); - ref_vect_->build_nb_digest_and_xor_matrix(sim_model.matr, - sim_model.bv_blocks); + bool ret = ref_vect_->build_nb_digest_and_xor_matrix(sim_model.matr, + sim_model.bv_blocks); + if (!ret) + return ret; sync_nb_vect(); @@ -1464,6 +1470,7 @@ void xor_scanner::compute_sim_model(bm::xor_sim_model& sim_model, size_type nb = *en; compute_sim_model(sim_model.matr, nb, col, params); } // for en + return true; } // -------------------------------------------------------------------------- From eaf2be64779a771b3c35b61a1a382ad6e064abb1 Mon Sep 17 00:00:00 2001 From: "Shutov, Oleg" Date: Tue, 8 Nov 2022 12:29:24 -0500 Subject: [PATCH 2/2] reader_test nullptr fix --- tools/tax/src/tests/reader_test.cpp | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/tools/tax/src/tests/reader_test.cpp b/tools/tax/src/tests/reader_test.cpp index a359643d..8dc37866 100644 --- a/tools/tax/src/tests/reader_test.cpp +++ b/tools/tax/src/tests/reader_test.cpp @@ -108,20 +108,20 @@ TEST(fasta_reader) { Reader::Fragment f; ASSERT(reader.read(&f)); - ASSERT_EQUALS(f.spotid, "SRR1068106.1 1 length=499"); + ASSERT_EQUALS(f.spotid, "SRR1068106.1"); ASSERT_EQUALS(f.bases, "AAGTCGTAACAAGGTCTCCGTTGGTGAACCAGCGGAGGGATCATTACCGAGTTTACAACTCCCAAACCCCTGTGAACATACCACTTGTTGCCTCGGCGGATCAGCCCGCTCCCGGTAAAACGGGACGGCCCGCCAGAGGACCCCTAAACTCTGTTTCTATATGTAACTTCTGAGTAAAACCATAAATAAATCAAAACTTTCAACAACGGATCTCTTGGTTCTGGCATCGATGAAGAACGCAGCAAAATGCGATAAGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCGCCCGCCAGTATTCTGGCGGGCATGCCTGTTCGAGCGTCATTTCAACCCTCAAGCACAGCTTGGTGTTGGGACTCGCGTTAATTCGCGTTCCTCAAATTGATTGGCGGTCACGTCGAGCTTCCATAGCGTAGTAGTAAAACCCTCGTTACTGGTAATCGTCGCGGCCACGCCGTTAAACCCCACTTCTGAA"); - ASSERT(reader.read(nullptr)); + ASSERT(reader.read(&f)); ASSERT(reader.read(&f)); ASSERT(reader.read(&f)); - ASSERT_EQUALS(f.spotid, "SRR1068106.4 4 length=512"); + ASSERT_EQUALS(f.spotid, "SRR1068106.4"); ASSERT_EQUALS(f.bases, "AAGTCGTAACAAGGTCTCCGTTGGTGAACCAGCGGAGGGATCATTACCGAGTTTACAACTCCCAAACCCCTGTGAACATACCACTTGTTGCCTCGGCGGATCAGCCCGCTCCCGGTAAAACGGGACGGCCCGCCAGAGGACCCCTAAACTCTGTTTCTATATGTAACTTCTGAGTAAAACCATAAATAAATCAAAACTTTCAACAACGGATCTCTTGGTTCTGGCATCGATGAAGAACGCAGCAAAATGCGATAAGTAATGTGAATTGCAGAATTCAGTGAATCATCGAATCTTTGAACGCACATTGCGCCCGCCAGTATTCTGGCGGGCATGCCTGTTCGAGCGTCATTTCAACCCTCAAGCACAGCTTGGTGTTGGGACTCGCGTTAATTCGCGTTCCTCAAATTGATTGGCGGTCACGTCGAGCTTCCATAGCGTAGTAGTAAAACCCTCGTTACTGGTAATCGTCGCGGCCACGCCGTTAAACCCCAACTTCCTGAATGTTGACCTCG"); size_t got_spots = 4; - while (reader.read(nullptr)) { + while (reader.read(&f)) ++got_spots; - } + ASSERT_EQUALS(got_spots, 226); ASSERT(reader.stats() == Reader::SourceStats(got_spots, got_spots)); ASSERT_EQUALS(reader.progress(), 1);