Skip to content

Commit

Permalink
Merge branch 'main' into stf_fix_cudastf_debug
Browse files Browse the repository at this point in the history
  • Loading branch information
caugonnet authored Feb 26, 2025
2 parents 1a0e755 + 00b545b commit d3aa270
Showing 1 changed file with 74 additions and 27 deletions.
101 changes: 74 additions & 27 deletions libcudacxx/include/cuda/std/__bit/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

Expand All @@ -21,8 +21,14 @@
# pragma system_header
#endif // no system header

#include <cuda/__ptx/instructions/bfind.h>
#include <cuda/__ptx/instructions/shl.h>
#include <cuda/__ptx/instructions/shr.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/countl.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>
Expand All @@ -32,40 +38,81 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
// Returns floor(log2(__t)), i.e. the index of the most significant set bit of __t.
//
// For __t == 0 the result is 0xFFFFFFFF: on the host path `digits - 1 - countl_zero(0)` is -1 and wraps
// when converted to uint32_t; the device path compares the bfind result against ~uint32_t{0} as the
// "no bit set" value. bit_width and bit_floor rely on this sentinel.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
{
  static_assert(__cccl_is_unsigned_integer_v<_Tp>, "__bit_log2 requires unsigned");
  // The PTX fast path is only usable at run time; constant evaluation falls through to countl_zero.
  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
  {
    if constexpr (sizeof(_Tp) <= 8)
    {
      // Widen to the 32- or 64-bit operand type passed to bfind.
      using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
      NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));))
    }
    else
    {
      // Wider than 64 bits: look at the upper 64 bits first and fall back to the lower 64 bits
      // when no bit is set there (bfind result == 0xFFFFFFFF).
      NV_IF_TARGET(NV_IS_DEVICE,
                   (auto __high = _CUDA_VPTX::bfind(static_cast<uint64_t>(__t >> 64));
                    return __high == ~uint32_t{0} ? _CUDA_VPTX::bfind(static_cast<uint64_t>(__t)) : __high + 64;))
    }
  }
  // Host / constant-evaluated path.
  return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
}

// Power-of-two helper for types at least as wide as uint32_t: shifts 1 left by the number of
// significant bits of (__t - 1). The caller visible in this file (bit_ceil) only reaches it for __t >= 2.
template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
{
  const auto __pred  = static_cast<_Tp>(__t - 1u);
  const auto __nbits = numeric_limits<_Tp>::digits - __countl_zero(__pred);
  return _Tp{1} << __nbits;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
{
return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
+ (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
>> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
// if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
_CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits);
return static_cast<int>(__ret);
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_floor(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
{
return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
_CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
// if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1);
if constexpr (sizeof(_Tp) <= 8)
{
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
// CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type
// The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than the ternary operator
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 2^(ceil(log2(__t - 1)))
auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); //
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return __ret;))
}
}
auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width);
_CCCL_BUILTIN_ASSUME(__ret >= __t);
return __ret;
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_ceil(_Tp __t) noexcept
{
return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t));
}

template <class _Tp>
_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, int> bit_width(_Tp __t) noexcept
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
{
return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
// __bit_log2 returns 0xFFFFFFFF if __t == 0
if constexpr (sizeof(_Tp) <= 8)
{
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
// CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
// -> the result is 0 if __t == 0
NV_IF_TARGET(NV_IS_DEVICE, //
(auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(log2(t))
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return __ret;))
}
}
auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2);
_CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
return __ret;
}

_LIBCUDACXX_END_NAMESPACE_STD
Expand Down

0 comments on commit d3aa270

Please sign in to comment.