Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize bit_floor, bit_ceil, bit_width #3296

Merged
merged 18 commits into from
Feb 26, 2025
Merged
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 63 additions & 28 deletions libcudacxx/include/cuda/std/__bit/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

Expand All @@ -21,51 +21,86 @@
# pragma system_header
#endif // no system header

#include <cuda/__ptx/instructions/bfind.h>
// #include <cuda/__ptx/instructions/shl.h>
// #include <cuda/__ptx/instructions/shr.h>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove commented out includes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this PR is marked blocked because it depends on these two instructions

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/countl.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept
{
static_assert(__cccl_is_unsigned_integer_v<_Tp>, "__bit_log2 requires unsigned");
return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
{
using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));))
}
return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
{
  // Round __t up to the smallest power of two not less than __t.
  // Width of (__t - 1) in bits; shifting 1 by it yields the next power of two.
  // NOTE(review): assumes __t >= 2 (bit_ceil filters 0 and 1 before calling) —
  // for __t == 0 the shift count would equal the type width. Confirm at call sites.
  const int __shift = numeric_limits<_Tp>::digits - __countl_zero(static_cast<_Tp>(__t - 1u));
  return static_cast<_Tp>(_Tp{1} << __shift);
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
{
return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
+ (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
>> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
// if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment and the code do not agree. __bit_log2 is returning an int so this would be signed overflow aka UB.

Please add the appropriate casts if you want to intermittently cast to unsigned

_CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
return __ret;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_floor(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
{
return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
_CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
// if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1));
#ifdef PTX_SHL_SHR
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false)
{
// CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type
// The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than ternary operator
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please file a backend bug for that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is a bug. Recent GPU archs provide MNMX instructions to compute minimum and maximum efficiently. The ternary operator has a different semantic. I don't think the compiler is able to understand the program logic enough to exploit this optimization.

NV_IF_TARGET(NV_IS_DEVICE, //
(auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width
auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); //
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return __ret;))
}
#endif
auto __ret = __t <= 1 ? 1 : _Tp{1} << __width;
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return static_cast<_Tp>(__ret);
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_ceil(_Tp __t) noexcept
{
  // Smallest power of two that is not less than __t.
  // 0 and 1 both map to 1; larger values are rounded up by __ceil2.
  if (__t < 2)
  {
    return _Tp{1};
  }
  return static_cast<_Tp>(__ceil2(__t));
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, int> bit_width(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
{
return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
// __bit_log2 returns 0xFFFFFFFF if __t == 0
#ifdef PTX_SHL_SHR
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
{
// CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
// -> the result is 0 if __t == 0
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); //
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return __ret;)
}
#endif
auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return static_cast<_Tp>(__ret);
}

_LIBCUDACXX_END_NAMESPACE_STD
Expand Down
Loading