-
Notifications
You must be signed in to change notification settings - Fork 196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize `bit_floor`, `bit_ceil`, `bit_width`
#3296
Changes from 15 commits
2aeb329
187ff31
5f40fc1
294222b
4d9fa39
31743f8
079780e
66a0710
2dce21e
977a355
499b4f6
573fb29
6c3b6e7
f6091f6
cba67a2
3d0ded9
fc0e646
bacdd2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
|
@@ -21,51 +21,86 @@ | |
# pragma system_header | ||
#endif // no system header | ||
|
||
#include <cuda/__ptx/instructions/bfind.h> | ||
// #include <cuda/__ptx/instructions/shl.h> | ||
// #include <cuda/__ptx/instructions/shr.h> | ||
#include <cuda/std/__algorithm/max.h> | ||
#include <cuda/std/__bit/countl.h> | ||
#include <cuda/std/__type_traits/enable_if.h> | ||
#include <cuda/std/__concepts/concept_macros.h> | ||
#include <cuda/std/__type_traits/conditional.h> | ||
#include <cuda/std/__type_traits/is_constant_evaluated.h> | ||
#include <cuda/std/__type_traits/is_unsigned_integer.h> | ||
#include <cuda/std/cstdint> | ||
#include <cuda/std/limits> | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_STD | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept | ||
{ | ||
static_assert(__cccl_is_unsigned_integer_v<_Tp>, "__bit_log2 requires unsigned"); | ||
return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); | ||
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) | ||
{ | ||
using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>; | ||
NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));)) | ||
} | ||
return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); | ||
} | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept | ||
{ | ||
return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); | ||
} | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept | ||
_CCCL_TEMPLATE(class _Tp) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept | ||
{ | ||
return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) | ||
+ (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits))) | ||
>> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)); | ||
// if __t == 0, __bit_log2(0) returns -1 (its return type is int), so the result is -1 + 1 = 0 | ||
auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The comment and the code do not agree. Please add the appropriate casts if you want to temporarily cast to unsigned. |
||
_CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits); | ||
return __ret; | ||
miscco marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_floor(_Tp __t) noexcept | ||
_CCCL_TEMPLATE(class _Tp) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept | ||
{ | ||
return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); | ||
_CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); | ||
// if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32 | ||
auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1)); | ||
#ifdef PTX_SHL_SHR | ||
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false) | ||
{ | ||
// CUDA left shift (ptx::shl) returns 0 if the right operand is greater than or equal to the number of bits of the type | ||
// The result is computed as max(1, 1 << bit_width(__t - 1)) because it is more efficient than the ternary operator | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please file a backend bug for that. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think it is a bug. Recent GPU archs provide |
||
NV_IF_TARGET(NV_IS_DEVICE, // | ||
(auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width | ||
auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); // | ||
_CCCL_BUILTIN_ASSUME(__ret >= __t); | ||
return __ret;)) | ||
} | ||
#endif | ||
auto __ret = __t <= 1 ? 1 : _Tp{1} << __width; | ||
_CCCL_BUILTIN_ASSUME(__ret >= __t); | ||
return static_cast<_Tp>(__ret); | ||
} | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_ceil(_Tp __t) noexcept | ||
{ | ||
return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); | ||
} | ||
|
||
template <class _Tp> | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, int> bit_width(_Tp __t) noexcept | ||
_CCCL_TEMPLATE(class _Tp) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept | ||
{ | ||
return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1); | ||
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>; | ||
auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); | ||
// __bit_log2 returns -1 (0xFFFFFFFF when converted to an unsigned shift amount) if __t == 0 | ||
#ifdef PTX_SHL_SHR | ||
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) | ||
{ | ||
// CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type | ||
// -> the result is 0 if __t == 0 | ||
NV_IF_TARGET(NV_IS_DEVICE, // | ||
(auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); // | ||
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); | ||
return __ret;) | ||
} | ||
#endif | ||
auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; | ||
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); | ||
return static_cast<_Tp>(__ret); | ||
} | ||
|
||
_LIBCUDACXX_END_NAMESPACE_STD | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove commented out includes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this PR is marked blocked because it depends on these two instructions