Skip to content

Commit

Permalink
add PTX shl and shr
Browse files Browse the repository at this point in the history
  • Loading branch information
fbusato committed Feb 25, 2025
1 parent 3d0ded9 commit fc0e646
Showing 1 changed file with 43 additions and 31 deletions.
74 changes: 43 additions & 31 deletions libcudacxx/include/cuda/std/__bit/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
#endif // no system header

#include <cuda/__ptx/instructions/bfind.h>
// #include <cuda/__ptx/instructions/shl.h>
// #include <cuda/__ptx/instructions/shr.h>
#include <cuda/__ptx/instructions/shl.h>
#include <cuda/__ptx/instructions/shr.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/countl.h>
#include <cuda/std/__concepts/concept_macros.h>
Expand All @@ -36,12 +36,21 @@
_LIBCUDACXX_BEGIN_NAMESPACE_STD

// Returns the zero-based index of the most significant set bit of __t, i.e. floor(log2(__t)).
// For __t == 0 the result is the all-ones value (0xFFFFFFFF once returned as uint32_t); callers rely on that.
// NOTE(review): this text is a rendered diff without +/- markers, so the removed and the added variant of several
// lines appear back to back (two signatures, two `if` conditions, two bfind paths for types up to 64 bits).
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept
_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
{
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));))
if constexpr (sizeof(_Tp) <= 8)
{
// Types up to 64 bits: widen to uint32_t/uint64_t and use a single bfind on device
using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));))
}
else
{
// Types wider than 64 bits: search the upper 64 bits first. An all-ones bfind result is treated as
// "no bit set there", in which case the lower 64 bits are searched; otherwise the index is offset by 64
NV_IF_TARGET(NV_IS_DEVICE,
(auto __high = _CUDA_VPTX::bfind(static_cast<uint64_t>(__t >> 64));
return __high == ~uint32_t{0} ? _CUDA_VPTX::bfind(static_cast<uint64_t>(__t)) : __high + 64;))
}
}
// Generic path (constant evaluation, or when no device branch returned above): derive the index from the number of
// leading zeros
return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
}
Expand All @@ -52,32 +61,34 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexc
{
// if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
// The two assumptions below are the removed and the added line of the diff. With a __bit_log2 that returns uint32_t,
// __ret is unsigned, so the `__ret >= 0` half of the older assumption is trivially true and only the upper bound
// carries information
_CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
_CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits);
return __ret;
}

// Returns the smallest power of two that is not smaller than __t (1 for __t == 0 and __t == 1).
// Only available for unsigned integer types (see the requires clause).
// NOTE(review): this text is a rendered diff without +/- markers; the `#ifdef PTX_SHL_SHR` ... `#endif` region and the
// second of each pair of near-duplicate statements are the removed and added variants of the same logic.
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
{
// Types up to 32 bits are computed in uint32_t; wider types are computed in _Tp itself
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
// NOTE(review): this also rejects __t == max() / 2 + 1 (the largest power of two of _Tp), whose result would still
// be representable -- confirm that the stricter bound is intended
_CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
// if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1));
#ifdef PTX_SHL_SHR
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false)
auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1);
if constexpr (sizeof(_Tp) <= 8)
{
// CUDA left shift (ptx::shl) returns 0 if the shift amount is not smaller than the number of bits of the type
// The result is computed as max(1, 1 << bit_width(__t - 1)) because it is more efficient than ternary operator
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width
auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); //
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return __ret;))
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
// CUDA left shift (ptx::shl) returns 0 if the shift amount is not smaller than the number of bits of the type
// The result is computed as max(1, 1 << bit_width(__t - 1)) because it is more efficient than the ternary operator
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 2^bit_width(__t - 1), 0 if __width is too large
auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); //
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return __ret;))
}
}
#endif
// Generic path: a plain C++ shift must not receive a shift amount equal to the type width, so __t <= 1 (which
// includes the __t == 0 wrap-around case above) is handled explicitly
auto __ret = __t <= 1 ? 1 : _Tp{1} << __width;
auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width);
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return static_cast<_Tp>(__ret);
return __ret;
}

_CCCL_TEMPLATE(class _Tp)
Expand All @@ -87,20 +98,21 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc
// Body of bit_floor: yields the largest power of two that is not greater than __t, and 0 for __t == 0.
// NOTE(review): this text is a rendered diff without +/- markers; the `#ifdef PTX_SHL_SHR` ... `#endif` region and the
// second of each pair of near-duplicate statements are the removed and added variants of the same logic.
// Types up to 32 bits are computed in uint32_t; wider types are computed in _Tp itself
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
// __bit_log2 returns 0xFFFFFFFF if __t == 0
#ifdef PTX_SHL_SHR
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
if constexpr (sizeof(_Tp) <= 8)
{
// CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
// -> the result is 0 if __t == 0
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); //
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return __ret;)
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
// CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
// -> the result is 0 if __t == 0
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(floor(log2(__t))), 0 if __t == 0
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return __ret;))
}
}
#endif
// Generic path: a plain C++ shift must not receive the 0xFFFFFFFF shift amount, so __t == 0 is handled explicitly
auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2);
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return static_cast<_Tp>(__ret);
return __ret;
}

_LIBCUDACXX_END_NAMESPACE_STD
Expand Down

0 comments on commit fc0e646

Please sign in to comment.