Skip to content

Commit

Permalink
optimize popcount implementation
Browse files Browse the repository at this point in the history
The gcc backend of rustc currently emits the following for a function that
implements popcount for a u32 (x86_64 targeting AVX2, using the standard Unix
calling convention):

    popcount:
        mov     eax, edi
        and     edi, 1431655765
        shr     eax
        and     eax, 1431655765
        add     edi, eax
        mov     edx, edi
        and     edi, 858993459
        shr     edx, 2
        and     edx, 858993459
        add     edx, edi
        mov     eax, edx
        and     edx, 252645135
        shr     eax, 4
        and     eax, 252645135
        add     eax, edx
        mov     edx, eax
        and     eax, 16711935
        shr     edx, 8
        and     edx, 16711935
        add     edx, eax
        movzx   eax, dx
        shr     edx, 16
        add     eax, edx
        ret

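Rather than using this implementation, gcc could be told to use Wegner's
algorithm, which counts set bits by repeatedly clearing the lowest one
(`value &= value - 1`) until the value is zero.  This would give the same
function the following implementation: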

    popcount:
        xor eax, eax
        xor edx, edx
        popcnt eax, edi
        test edi, edi
        cmove eax, edx
        ret

This patch implements the popcount operation in terms of Wegner's algorithm in
all cases.

Signed-off-by: Andy Sadler <[email protected]>
  • Loading branch information
sadlerap committed Oct 17, 2023
1 parent 4dce75f commit 64abf58
Showing 1 changed file with 39 additions and 61 deletions.
100 changes: 39 additions & 61 deletions src/intrinsic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod simd;
#[cfg(feature="master")]
use std::iter;

use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
use gccjit::{BinaryOp, ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
use rustc_codegen_ssa::MemFlags;
use rustc_codegen_ssa::base::wants_msvc_seh;
use rustc_codegen_ssa::common::IntPredicate;
Expand Down Expand Up @@ -820,74 +820,52 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
};

if value_type.is_u128(&self.cx) {
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
// implementation (that does not require a call to __popcountdi2).
let popcount = self.context.get_builtin_function("__builtin_popcountll");
let sixty_four = self.gcc_int(value_type, 64);
let right_shift = self.gcc_lshr(value, sixty_four);
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
let high = self.context.new_call(None, popcount, &[high]);
let high = self.pop_count(high);
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
let low = self.context.new_call(None, popcount, &[low]);
let low = self.pop_count(low);
let res = high + low;
return self.gcc_int_cast(res, result_type);
}

// First step.
let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
let right = shifted & mask;
let value = left + right;

// Second step.
let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
let right = shifted & mask;
let value = left + right;

// Third step.
let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
let right = shifted & mask;
let value = left + right;

if value_type.is_u8(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Fourth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
let right = shifted & mask;
let value = left + right;

if value_type.is_u16(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Fifth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
let right = shifted & mask;
let value = left + right;

if value_type.is_u32(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Sixth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
let right = shifted & mask;
let value = left + right;

self.context.new_cast(None, value, result_type)
// Use Wegner's algorithm for population count; gcc seems to play better with it:
// for (int counter = 0; value != 0; counter++) {
// value &= value - 1;
// }
let func = self.current_func.borrow().expect("func");
let loop_head = func.new_block("head");
let loop_body = func.new_block("body");
let loop_tail = func.new_block("tail");

let counter_type = self.int_type;
let counter = self.current_func().new_local(None, counter_type, "popcount_counter");
let val = self.current_func().new_local(None, value_type, "popcount_value");
let zero = self.context.new_rvalue_zero(counter_type);
self.llbb().add_assignment(None, counter, zero);
self.llbb().add_assignment(None, val, value);
self.br(loop_head);

// check if value isn't zero
self.switch_to_block(loop_head);
let zero = self.context.new_rvalue_zero(value_type);
let cond = self.context.new_comparison(None, ComparisonOp::NotEquals, val.to_rvalue(), zero);
self.cond_br(cond, loop_body, loop_tail);

// val &= val - 1;
self.switch_to_block(loop_body);
let sub = val.to_rvalue() - self.context.new_rvalue_one(value_type);
loop_body.add_assignment_op(None, val, BinaryOp::BitwiseAnd, sub);

// counter += 1
let one = self.context.new_rvalue_one(counter_type);
loop_body.add_assignment_op(None, counter, BinaryOp::Plus, one);
self.br(loop_head);

// end of loop
self.switch_to_block(loop_tail);
self.context.new_cast(None, counter.to_rvalue(), result_type)
}

// Algorithm from: https://blog.regehr.org/archives/1063
Expand Down

0 comments on commit 64abf58

Please sign in to comment.