Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize and clean countl, countr, popcount, has_single_bit #3414

Open
wants to merge 41 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
5d25e33
optimize and clean bit functions
fbusato Dec 31, 2024
3613a85
Merge branch 'main' into optimized-bit-operations
fbusato Dec 31, 2024
7b6329c
remove builtin constexpr functions
fbusato Dec 31, 2024
0b02422
disable c++11 tests
fbusato Dec 31, 2024
1ea3673
remove c++11 tests related to optimized functions
fbusato Jan 1, 2025
00398ec
remove constexpr from runtime functions
fbusato Jan 1, 2025
ae10736
remove c++11 bit_width tests
fbusato Jan 1, 2025
f02a1e5
remove constexpr from runtime ctz
fbusato Jan 1, 2025
492486b
add is_constant_evaluated header
fbusato Jan 1, 2025
15d4d3e
remove __detail namespace and minor fixes
fbusato Jan 2, 2025
d424217
Merge branch 'main' into optimized-bit-operations
fbusato Jan 2, 2025
298d9a1
simplify low-level functions
fbusato Jan 9, 2025
dabacb4
add concept-like macros and assumptions
fbusato Jan 9, 2025
fba192f
revert integral.h and rotate.h to avoid conflicts
fbusato Jan 15, 2025
043b643
add comment
fbusato Jan 16, 2025
7bc515b
fix concept-like macro
fbusato Jan 16, 2025
963865d
fix wrong include
fbusato Jan 16, 2025
d17031b
update copyright
fbusato Jan 16, 2025
6fd4da7
remove unneeded cuda::std qualifications
fbusato Jan 16, 2025
e3fd4fd
Merge branch 'main' into optimized-countl-countr-has_single_bit
fbusato Jan 21, 2025
b1bac2f
move is_unsigned_integer to this PR
fbusato Jan 21, 2025
7a02475
missing header
fbusato Jan 21, 2025
fa060e7
add _CCCL_INLINE_VAR
fbusato Jan 21, 2025
c849bcc
Merge branch 'main' into optimized-countl-countr-has_single_bit
fbusato Jan 21, 2025
67fffa5
add missing path in _CCCL_BUILTIN_ASSUME
fbusato Jan 21, 2025
0b4345b
workaround for is_constant_evaluated() with old GCC
fbusato Jan 21, 2025
c22a462
use _CCCL_ASSUME instead of builtin
fbusato Jan 21, 2025
98c5979
simplify runtime popc conditions
fbusato Jan 21, 2025
1478151
simplify __runtime_clz
fbusato Jan 21, 2025
72f5d22
workaround for GCC9 constexpr evaluation
fbusato Jan 21, 2025
84b4d3e
workaround for gcc9 and bit_cast
fbusato Jan 22, 2025
7c141e9
remove global namespace for MSVC intrinsics
fbusato Jan 22, 2025
af6807b
use if constexpr to simplify the code
fbusato Jan 22, 2025
86697d2
use _CCCL_BUILTIN_CLZ
fbusato Jan 22, 2025
54a4b29
simplify intrinsics with c++17
fbusato Jan 22, 2025
413282b
add cuda std namespace
fbusato Jan 22, 2025
6126b28
sync builtin with main
fbusato Jan 22, 2025
46f8522
Merge branch 'main' into optimized-countl-countr-has_single_bit
fbusato Jan 22, 2025
5f8d181
fix builtin assume
fbusato Jan 23, 2025
3be860c
fix msvc function parameter
fbusato Jan 23, 2025
f4836d2
fix _BitScanReverse
fbusato Jan 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 32 additions & 106 deletions libcudacxx/include/cuda/std/__bit/clz.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

Expand All @@ -22,132 +22,58 @@
#endif // no system header

#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>

#if _CCCL_COMPILER(MSVC)
# include <intrin.h>
#endif // _CCCL_COMPILER(MSVC)

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// Last step of the recursive leading-zero count: inspects only bit 1 of __x.
// When bit 1 is clear, bit 0 of __c is flipped; otherwise __c is returned unchanged.
// With an even __c (what a call chain rooted at 0 or 32 produces) the flip is the
// same as adding one. Bit 0 of __x is never examined.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz2(uint64_t __x, int __c)
{
  return __c ^ (((__x & 0x2) == 0) ? 1 : 0);
}
// Narrows a 4-bit window to 2 bits.
// If either of bits 3..2 is set, the window moves down by 2 and __c is kept;
// otherwise the low 2 bits are examined and 2 is added to __c.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz4(uint64_t __x, int __c)
{
  return ((__x & 0xC) != 0) ? __binary_clz2(__x >> 2, __c) : __binary_clz2(__x, __c + 2);
}
// Narrows an 8-bit window to 4 bits.
// If any of bits 7..4 is set, the window moves down by 4 and __c is kept;
// otherwise the low 4 bits are examined and 4 is added to __c.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz8(uint64_t __x, int __c)
{
  return ((__x & 0xF0) != 0) ? __binary_clz4(__x >> 4, __c) : __binary_clz4(__x, __c + 4);
}
// Narrows a 16-bit window to 8 bits.
// If any of bits 15..8 is set, the window moves down by 8 and __c is kept;
// otherwise the low 8 bits are examined and 8 is added to __c.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz16(uint64_t __x, int __c)
{
  return ((__x & 0xFF00) != 0) ? __binary_clz8(__x >> 8, __c) : __binary_clz8(__x, __c + 8);
}
// Narrows a 32-bit window to 16 bits.
// If any of bits 31..16 is set, the window moves down by 16 and __c is kept;
// otherwise the low 16 bits are examined and 16 is added to __c.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz32(uint64_t __x, int __c)
{
  return ((__x & 0xFFFF0000) != 0) ? __binary_clz16(__x >> 16, __c) : __binary_clz16(__x, __c + 16);
}
// Root of the recursive leading-zero count for a 64-bit value.
// If the upper 32 bits hold a set bit they are examined with a running count of 0;
// otherwise the lower 32 bits are examined with a running count of 32.
// The last step never inspects bit 0, so an all-zero input yields 63, not 64.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz64(uint64_t __x)
{
  return ((__x & 0xFFFFFFFF00000000) != 0) ? __binary_clz32(__x >> 32, 0) : __binary_clz32(__x, 32);
}

#if !_CCCL_COMPILER(MSVC)

// Constant-expression-friendly leading-zero count for a 32-bit value.
// Host compilation uses the compiler builtin; device compilation uses the arithmetic
// __binary_clz32 chain because no constexpr builtin is available there.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint32_t __x) noexcept
{
# if !defined(__CUDA_ARCH__)
  return __builtin_clz(__x);
# else // device pass: no device constexpr builtins
  return __binary_clz32(uint64_t{__x}, 0);
# endif
}

// Constant-expression-friendly leading-zero count for a 64-bit value.
// Host compilation uses the compiler builtin; device compilation uses the arithmetic
// __binary_clz64 chain because no constexpr builtin is available there.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint64_t __x) noexcept
{
# if !defined(__CUDA_ARCH__)
  return __builtin_clzll(__x);
# else // device pass: no device constexpr builtins
  return __binary_clz64(__x);
# endif
}
#if _CCCL_COMPILER(MSVC)

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The `__cccl_clz(uint32_t)` signature, the `_CCCL_STD_VER`
// guard, the NV_IF_ELSE_TARGET dispatch and the `__constexpr_clz` return appear to be
// the removed non-MSVC overload; the template lines appear to be its replacement.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x) noexcept
// __msvc_constexpr_clz: counts leading zero bits with a plain loop and no compiler
// intrinsics. Returns numeric_limits<_Tp>::digits when no bit of __x is set.
template <typename _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __msvc_constexpr_clz(_Tp __x) noexcept
{
// (removed overload) the runtime dispatch is only compiled for C++14 and later
# if _CCCL_STD_VER >= 2014
if (!__cccl_default_is_constant_evaluated())
// Bit width of _Tp.
constexpr auto __digits = numeric_limits<_Tp>::digits;
// Walk from the most significant bit downward; the first set bit fixes the answer.
for (int __i = __digits - 1; __i >= 0; --__i)
{
// (removed overload) runtime path: __clz in device code, __builtin_clz on the host
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);))
if (__x & (_Tp{1} << __i))
{
// Every bit above position __i is zero, which makes __digits - 1 - __i of them.
return __digits - 1 - __i;
}
}
# endif
// (removed overload) reached whenever the runtime dispatch above is not taken
return __constexpr_clz(__x);
// No bit set: all __digits bits are leading zeros.
return __digits;
}

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The `__cccl_clz(uint64_t)` signature, the `_CCCL_STD_VER`
// guard, the NV_IF_ELSE_TARGET dispatch and the `__constexpr_clz` return appear to be
// the removed non-MSVC overload; the template lines appear to be its replacement.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x) noexcept
// __msvc_runtime_clz: leading-zero count built on the MSVC bit-scan intrinsics.
// Not constexpr. Returns numeric_limits<_Tp>::digits when the scan reports no set bit.
template <typename _Tp>
_LIBCUDACXX_HIDE_FROM_ABI int __msvc_runtime_clz(_Tp __x) noexcept
{
// (removed overload) the runtime dispatch is only compiled for C++14 and later
# if _CCCL_STD_VER >= 2014
if (!__cccl_default_is_constant_evaluated())
{
// (removed overload) runtime path: __clzll in device code, __builtin_clzll on the host
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);))
}
# endif
// (removed overload) reached whenever the runtime dispatch above is not taken
return __constexpr_clz(__x);
// Bit width of _Tp.
constexpr auto __digits = numeric_limits<_Tp>::digits;
// Output slot for the bit-scan intrinsic; only read below when the scan succeeded.
unsigned long __where;
// Select the 32- or 64-bit scan from the size of _Tp.
// NOTE(review): this is an ordinary ternary, so both calls are compiled for every
// _Tp. The older 64-bit MSVC overload in this file guards _BitScanReverse64 with
// _LIBCUDACXX_HAS_BITSCAN64 because Win32 lacks it -- confirm 32-bit targets still build.
auto __res = sizeof(_Tp) == sizeof(uint32_t)
? _BitScanReverse(&__where, static_cast<uint32_t>(__x))
: _BitScanReverse64(&__where, static_cast<uint64_t>(__x));
// A zero result is treated as "no set bit", so all __digits bits count as leading
// zeros; otherwise __where is used as the index of the highest set bit.
return (__res) ? __digits - 1 - static_cast<int>(__where) : __digits;
}

#else // _CCCL_COMPILER(MSVC)
#endif // _CCCL_COMPILER(MSVC)

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The `__cccl_clz(uint32_t)` signature and the body up to
// the `__binary_clz32` return appear to be the removed MSVC overload; the template,
// the static_assert and the `#if`/`#else` dispatch appear to be its replacement.
//
// __cccl_clz (template): leading-zero count for uint32_t or uint64_t only.
// Precondition: __x != 0
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x)
template <typename _Tp>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(_Tp __x) noexcept
{
// (removed overload) host compilation only: use the bit-scan intrinsic at runtime
# if !defined(__CUDA_ARCH__)
if (!__cccl_default_is_constant_evaluated())
{
unsigned long __where = 0;
if (_BitScanReverse(&__where, __x))
{
// __where is used as the index of the highest set bit
return static_cast<int>(31 - __where);
}
return 32; // the scan reported no set bit, i.e. the precondition was violated
}
# endif // !defined(__CUDA_ARCH__)

// (removed overload) constant evaluation or device pass: arithmetic fallback
return __binary_clz32(static_cast<uint64_t>(__x), 0);
// Any other width must be widened by the caller before calling this function.
static_assert(is_same_v<_Tp, uint32_t> || is_same_v<_Tp, uint64_t>);
#if _CCCL_COMPILER(MSVC) && !defined(__CUDA_ARCH__)
// MSVC host pass: loop-based version during constant evaluation, intrinsics otherwise.
return is_constant_evaluated() ? _CUDA_VSTD::__msvc_constexpr_clz(__x) : _CUDA_VSTD::__msvc_runtime_clz(__x);
#else // _CCCL_COMPILER(MSVC) && !defined(__CUDA_ARCH__) ^^^ / every other configuration vvv
// Select the 32- or 64-bit builtin from the size of _Tp.
return sizeof(_Tp) == sizeof(uint32_t)
? _CCCL_BUILTIN_CLZ(static_cast<uint32_t>(__x))
: _CCCL_BUILTIN_CLZLL(static_cast<uint64_t>(__x));
#endif // _CCCL_COMPILER(MSVC) && !defined(__CUDA_ARCH__)
}

// 64-bit leading-zero count for the MSVC toolchain.
// Host runtime calls use the bit-scan intrinsics; constant evaluation and device
// compilation use the arithmetic __binary_clz64 chain.
// Precondition: __x != 0. On the intrinsic path a zero input yields 64.
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x)
{
# if !defined(__CUDA_ARCH__)
  if (!__cccl_default_is_constant_evaluated())
  {
    unsigned long __msb = 0;
#  if defined(_LIBCUDACXX_HAS_BITSCAN64)
    // A failed scan means no bit is set; 64 is returned for that case.
    return _BitScanReverse64(&__msb, __x) ? static_cast<int>(63 - __msb) : 64;
#  else
    // Win32 doesn't have _BitScanReverse64: probe the upper half first, then the lower.
    if (_BitScanReverse(&__msb, static_cast<uint32_t>(__x >> 32)))
    {
      // Highest set bit is at index __msb + 32 of the full word.
      return static_cast<int>(31 - __msb);
    }
    return _BitScanReverse(&__msb, static_cast<uint32_t>(__x)) ? static_cast<int>(63 - __msb) : 64;
#  endif
  }
# endif // !defined(__CUDA_ARCH__)

  return __binary_clz64(__x);
}

#endif

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX__BIT_CLZ_H
112 changes: 40 additions & 72 deletions libcudacxx/include/cuda/std/__bit/countl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

Expand All @@ -21,97 +21,65 @@
# pragma system_header
#endif // no system header

#include <cuda/std/__bit/bit_cast.h>
#include <cuda/std/__bit/clz.h>
#include <cuda/std/__bit/rotate.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// Forward decl for recursive use in split word operations
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept;

// Leading-zero count for types no wider than 32 bits.
// The value is widened to uint32_t, counted there, and the bits added by the widening
// are then removed from the result.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) <= sizeof(uint32_t), int>
__countl_zero_dispatch(_Tp __t) noexcept
{
  return __cccl_clz(static_cast<uint32_t>(__t)) + numeric_limits<_Tp>::digits - numeric_limits<uint32_t>::digits;
}

// Leading-zero count for types exactly as large as uint64_t.
// The value is converted to uint64_t, counted there, and any difference in bit width
// between uint64_t and _Tp is removed from the result.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) == sizeof(uint64_t), int>
__countl_zero_dispatch(_Tp __t) noexcept
{
  return __cccl_clz(static_cast<uint64_t>(__t)) + numeric_limits<_Tp>::digits - numeric_limits<uint64_t>::digits;
}

// Leading-zero count for integers wider than 64 bits, processed one 64-bit word at a
// time. _St starts at the number of 64-bit words in _Tp and drops by one per step.
template <typename _Tp, int _St = sizeof(_Tp) / sizeof(uint64_t)>
struct __countl_zero_rotl_impl
{
  // Entry point of one step: rotate by 64 bits, then examine the low 64 bits.
  static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t)
  {
    return __countl_iter(__rotl(__t, numeric_limits<uint64_t>::digits));
  }

  // Count the leading zeros of the low 64-bit word and hand the result on.
  static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_iter(_Tp __t)
  {
    return __short_circuit(__t, __countl_zero(static_cast<uint64_t>(__t)));
  }

  // A word that is not entirely zero ends the scan; an all-zero word (count == 64)
  // adds the result of the next step.
  static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __short_circuit(_Tp __t, int __cur)
  {
    return (__cur != numeric_limits<uint64_t>::digits)
           ? __cur
           : __cur + __countl_zero_rotl_impl<_Tp, _St - 1>::__count(__t);
  }
};

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The `__countl_zero_rotl_impl<_Tp, 1>` specialization and
// the wide `__countl_zero_dispatch` overload appear to be removed code; the
// _CCCL_TEMPLATE/_CCCL_REQUIRES function appears to be the replacement.
template <typename _Tp>
struct __countl_zero_rotl_impl<_Tp, 1>
// __countl_zero for unsigned integers of at most 64 bits: returns the number of
// leading zero bits, or numeric_limits<_Tp>::digits for a zero input.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) <= sizeof(uint64_t)))
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept
{
// (removed specialization) final word of the multi-word scan
static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t)
// During constant evaluation a zero input is answered here, before __cccl_clz is reached.
if (is_constant_evaluated() && __t == 0)
{
return __countl_zero(static_cast<uint64_t>(__rotl(__t, numeric_limits<uint64_t>::digits)));
return numeric_limits<_Tp>::digits;
}
};

// (removed overload) types wider than 64 bits go through the word-by-word scan
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<(sizeof(_Tp) > sizeof(uint64_t)), int>
__countl_zero_dispatch(_Tp __t) noexcept
{
return __countl_zero_rotl_impl<_Tp>::__count(__t);
// Widen to the 32- or 64-bit type that is passed to __cccl_clz.
using _Sp = _If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
auto __clz_result = _CUDA_VSTD::__cccl_clz(static_cast<_Sp>(__t));
// Remove the leading zeros that exist only because of the widening.
__clz_result -= numeric_limits<_Sp>::digits - numeric_limits<_Tp>::digits;
// Device code returns the count as is; host code handles __t == 0 explicitly.
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
(return __clz_result;), // if __t == 0 __clz_result is already equal to numeric_limits<_Tp>::digits
(return __t ? __clz_result : numeric_limits<_Tp>::digits;))
}

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The plain `template <class _Tp>` headers, the
// static_asserts and the single-expression returns appear to be the removed
// `__countl_zero` and `__countl_one`; the _CCCL_TEMPLATE/_CCCL_REQUIRES function with
// the loop appears to be the new `__countl_zero` for types wider than 64 bits.
template <class _Tp>
// __countl_zero for unsigned integers wider than 64 bits: scans 64-bit words from the
// most significant one down and returns numeric_limits<_Tp>::digits for a zero input.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp) _CCCL_AND(sizeof(_Tp) > sizeof(uint64_t)))
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept
{
static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned");
// (removed version) zero is answered directly, everything else goes to the dispatch
return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_one(_Tp __t) noexcept
{
static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned");
// (removed version) leading ones of __t are the leading zeros of ~__t
return __t != numeric_limits<_Tp>::max() ? __countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits;
// Number of 64-bit words in _Tp.
constexpr int _Ratio = sizeof(_Tp) / sizeof(uint64_t);
// Visit the words from the most significant (index _Ratio - 1) to the least.
for (int __i = _Ratio - 1; __i >= 0; --__i)
{
// Shift word __i down to the low 64 bits and truncate.
auto __value64 = static_cast<uint64_t>(__t >> (__i * numeric_limits<uint64_t>::digits));
// NOTE(review): __value64 is already uint64_t, so this cast has no effect.
if (static_cast<uint64_t>(__value64))
{
// Zeros inside this word plus 64 for each all-zero word already skipped above it.
return _CUDA_VSTD::__countl_zero(__value64) + (_Ratio - 1 - __i) * numeric_limits<uint64_t>::digits;
}
}
// Every word was zero.
return numeric_limits<_Tp>::digits;
}

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The enable_if_t signature and the single
// `return __countl_zero(__t);` appear to be the removed version.
//
// countl_zero: number of consecutive zero bits in __t starting from the most
// significant bit. Only participates for unsigned integer types.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int>
countl_zero(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_zero(_Tp __t) noexcept
{
return __countl_zero(__t);
// NOTE(review): __t already has type _Tp, so the cast below has no effect.
auto __ret = _CUDA_VSTD::__countl_zero(static_cast<_Tp>(__t));
// States that the result lies in [0, digits]; presumably an optimizer hint -- confirm
// against the definition of _CCCL_ASSUME.
_CCCL_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
return __ret;
}

// NOTE(review): these lines come from a diff rendered without +/- markers, so two
// revisions are interleaved. The enable_if_t signature and the single
// `return __countl_one(__t);` appear to be the removed version.
//
// countl_one: number of consecutive one bits in __t starting from the most
// significant bit, computed as the leading zeros of the bitwise complement.
// Only participates for unsigned integer types.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countl_one(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int countl_one(_Tp __t) noexcept
{
return __countl_one(__t);
// ~__t is promoted to int for types narrower than int; the cast truncates the
// complement back to _Tp so no extra high bits are counted.
auto __ret = _CUDA_VSTD::countl_zero(static_cast<_Tp>(~__t));
// States that the result lies in [0, digits]; presumably an optimizer hint -- confirm
// against the definition of _CCCL_ASSUME.
_CCCL_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
return __ret;
}

_LIBCUDACXX_END_NAMESPACE_STD
Expand Down
Loading
Loading