Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize bit_floor, bit_ceil, bit_width #3296

Merged
merged 18 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions docs/libcudacxx/standard_api/numerics_library/bit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,23 @@
``<cuda/std/bit>``
======================

Extensions
----------
CUDA Performance Considerations
-------------------------------

- All features of ``<bit>`` are made available in C++11 onwards
- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
- ``bit_ceil()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be greater than or equal to the input.
- ``bit_floor()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be less than or equal to the input.
- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions.
- ``rotl()/rotr()`` translate into a single ``SHF`` (funnel shift) SASS instruction.
- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.

Additional Notes
----------------

- All functions are marked ``[[nodiscard]]`` and ``noexcept``
- All functions support ``__uint128_t``
- ``bit_ceil()`` checks for overflow in debug mode
135 changes: 111 additions & 24 deletions libcudacxx/include/cuda/std/__bit/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

Expand All @@ -21,53 +21,140 @@
# pragma system_header
#endif // no system header

// #include <cuda/__ptx/instructions/bfind.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/countl.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_STD
// the following section will be removed when the ptx bfind is added
namespace cuda::ptx
{

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a)
{
static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned");
return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
uint32_t d;
asm volatile(
"{ \n\t\t"
"bfind.u32 %0, %1; \n\t\t"
"}"
: "=r"(d)
: "r"(a));
return d;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a)
{
return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)));
uint32_t d;
asm volatile(
"{ \n\t\t"
"bfind.u64 %0, %1; \n\t\t"
"}"
: "=r"(d)
: "l"(a));
return d;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a)
{
return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
+ (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
>> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
uint32_t d;
asm volatile(
"{ \n\t\t"
"bfind.u64 %0, %1; \n\t\t"
"}"
: "=r"(d)
: "l"(a));
return d;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a)
{
return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
return 0;
}

} // namespace cuda::ptx

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// Optimizer hint used by the functions in this header: forwards its arguments to __builtin_assume,
// but only when the enclosing call is not being constant-evaluated and only inside the
// NV_IS_DEVICE branch of NV_IF_TARGET. The macro is local to this header and is #undef'd at its end.
#define _CCCL_CUDA_BUILTIN_ASSUME(...) \
if (!_CUDA_VSTD::is_constant_evaluated()) \
{ \
NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
{
return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t));
if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
(return ::cuda::ptx::bfind(__t);), //
(return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);))
}
else
{
return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
}
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept
// bit_width: returns 0 when __t == 0, otherwise 1 + __bit_log2(__t).
// Constrained to types accepted by __cccl_is_unsigned_integer.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
{
// __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned wrap-around is well-defined, the result is
// 0xFFFFFFFF + 1 = 0
// _Up is the unsigned counterpart of the integer-promoted _Tp, so types narrower than int are widened first
using _Up = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // __ret has the type of uint32_t + int (unsigned, not int); it is converted to int on return
// Hint for the optimizer: the result lies in [0, numeric_limits<_Tp>::digits]
_CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
return __ret;
}

// bit_ceil: intended to return the smallest power of two that is >= __t (1 when __t <= 1).
// Inputs above numeric_limits<_Tp>::max() / 2 are rejected by _CCCL_ASSERT with "bit_ceil overflow".
// Constrained to types accepted by __cccl_is_unsigned_integer.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
{
// NOTE(review): the next statement returns before anything else in this body runs and casts to int,
// which matches a bit_width-style result rather than bit_ceil. It looks like a removed line that
// survived in this rendering of the diff - confirm against the real header.
return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
_CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
// Constant evaluation handles the trivial inputs explicitly
if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1)
{
return 1;
}
// if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
// In CUDA, unsigned{1} << 32 --> 0
// The result is computed as max(1, 1 << bit_width(__t - 1)) because max() requires fewer instructions than the
// ternary operator
using _Up = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __width is int (return type of bit_width)
// NOTE(review): _CCCL_BUILTIN_ASSUME(__ret >= __t) below has no terminating semicolon and is not the local
// _CCCL_CUDA_BUILTIN_ASSUME wrapper - confirm both are intended.
// clang-format off
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
(auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width));
_CCCL_BUILTIN_ASSUME(__ret >= __t)
return __ret;),
(return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);))
// clang-format on
}

// bit_floor: returns 0 when __t == 0, otherwise _Tp{1} << __bit_log2(__t), i.e. the largest power of two
// that is <= __t. Constrained to types accepted by __cccl_is_unsigned_integer.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
{
// __bit_log2 returns 0xFFFFFFFF if __t == 0
// (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
// -> the result is 0 if __t == 0
// Constant evaluation cannot rely on that shift behaviour, so zero is handled explicitly here
if (_CUDA_VSTD::is_constant_evaluated() && __t == 0)
{
return 0;
}
// _Up is the unsigned counterpart of the integer-promoted _Tp, so types narrower than int are widened first
using _Up = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
// The NV_IS_DEVICE branch shifts unconditionally (see the note above); the other branch keeps the zero check
NV_IF_ELSE_TARGET(NV_IS_DEVICE,
(return _Tp{1} << __log2;), //
(return __t == 0 ? 0 : _Tp{1} << __log2;))
}

#undef _CCCL_CUDA_BUILTIN_ASSUME

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___BIT_INTEGRAL_H
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

#include <cuda/std/detail/__config>

#include "cuda/std/__cccl/dialect.h"

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
Expand Down Expand Up @@ -48,6 +50,9 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type
{};
#endif

// Variable-template shorthand for __cccl_is_unsigned_integer<_Tp>::value.
template <typename _Tp>
_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value;

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H
Loading