From 64abf5862ffb5b32f1555642550eb18f383fdc3a Mon Sep 17 00:00:00 2001 From: Andy Sadler Date: Thu, 5 Oct 2023 19:23:46 -0500 Subject: [PATCH] optimize popcount implementation The gcc backend of rustc currently emits the following for a function that implements popcount for a u32 (x86_64 targeting AVX2, using standard unix calling convention): popcount: mov eax, edi and edi, 1431655765 shr eax and eax, 1431655765 add edi, eax mov edx, edi and edi, 858993459 shr edx, 2 and edx, 858993459 add edx, edi mov eax, edx and edx, 252645135 shr eax, 4 and eax, 252645135 add eax, edx mov edx, eax and eax, 16711935 shr edx, 8 and edx, 16711935 add edx, eax movzx eax, dx shr edx, 16 add eax, edx ret Rather than using this implementation, gcc could be told to use Wegner's algorithm. This would give the same function the following implementation: popcount: xor eax, eax xor edx, edx popcnt eax, edi test edi, edi cmove eax, edx ret This patch implements the popcount operation in terms of Wegner's algorithm in all cases. Signed-off-by: Andy Sadler --- src/intrinsic/mod.rs | 100 +++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 61 deletions(-) diff --git a/src/intrinsic/mod.rs b/src/intrinsic/mod.rs index 9caed459a2926..f0437bf4cc83e 100644 --- a/src/intrinsic/mod.rs +++ b/src/intrinsic/mod.rs @@ -4,7 +4,7 @@ mod simd; #[cfg(feature="master")] use std::iter; -use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType}; +use gccjit::{BinaryOp, ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType}; use rustc_codegen_ssa::MemFlags; use rustc_codegen_ssa::base::wants_msvc_seh; use rustc_codegen_ssa::common::IntPredicate; @@ -820,74 +820,52 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> { }; if value_type.is_u128(&self.cx) { - // TODO(antoyo): implement in the normal algorithm below to have a more efficient - // implementation (that does not require a call to __popcountdi2). 
- let popcount = self.context.get_builtin_function("__builtin_popcountll"); let sixty_four = self.gcc_int(value_type, 64); let right_shift = self.gcc_lshr(value, sixty_four); let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type); - let high = self.context.new_call(None, popcount, &[high]); + let high = self.pop_count(high); let low = self.gcc_int_cast(value, self.cx.ulonglong_type); - let low = self.context.new_call(None, popcount, &[low]); + let low = self.pop_count(low); let res = high + low; return self.gcc_int_cast(res, result_type); } - // First step. - let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 1); - let right = shifted & mask; - let value = left + right; - - // Second step. - let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 2); - let right = shifted & mask; - let value = left + right; - - // Third step. - let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 4); - let right = shifted & mask; - let value = left + right; - - if value_type.is_u8(&self.cx) { - return self.context.new_cast(None, value, result_type); - } - - // Fourth step. - let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 8); - let right = shifted & mask; - let value = left + right; - - if value_type.is_u16(&self.cx) { - return self.context.new_cast(None, value, result_type); - } - - // Fifth step. 
- let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 16); - let right = shifted & mask; - let value = left + right; - - if value_type.is_u32(&self.cx) { - return self.context.new_cast(None, value, result_type); - } - - // Sixth step. - let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF); - let left = value & mask; - let shifted = value >> self.context.new_rvalue_from_int(value_type, 32); - let right = shifted & mask; - let value = left + right; - - self.context.new_cast(None, value, result_type) + // Use Wenger's algorithm for population count, gcc's seems to play better with it + // for (int counter = 0; value != 0; counter++) { + // value &= value - 1; + // } + let func = self.current_func.borrow().expect("func"); + let loop_head = func.new_block("head"); + let loop_body = func.new_block("body"); + let loop_tail = func.new_block("tail"); + + let counter_type = self.int_type; + let counter = self.current_func().new_local(None, counter_type, "popcount_counter"); + let val = self.current_func().new_local(None, value_type, "popcount_value"); + let zero = self.context.new_rvalue_zero(counter_type); + self.llbb().add_assignment(None, counter, zero); + self.llbb().add_assignment(None, val, value); + self.br(loop_head); + + // check if value isn't zero + self.switch_to_block(loop_head); + let zero = self.context.new_rvalue_zero(value_type); + let cond = self.context.new_comparison(None, ComparisonOp::NotEquals, val.to_rvalue(), zero); + self.cond_br(cond, loop_body, loop_tail); + + // val &= val - 1; + self.switch_to_block(loop_body); + let sub = val.to_rvalue() - self.context.new_rvalue_one(value_type); + loop_body.add_assignment_op(None, val, BinaryOp::BitwiseAnd, sub); + + // counter += 1 + let one = self.context.new_rvalue_one(counter_type); + loop_body.add_assignment_op(None, counter, BinaryOp::Plus, one); + 
self.br(loop_head); + + // end of loop + self.switch_to_block(loop_tail); + self.context.new_cast(None, counter.to_rvalue(), result_type) } // Algorithm from: https://blog.regehr.org/archives/1063