From 2aeb329489abcfd35bcbbbc1e92ed85bd41fcd82 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 9 Jan 2025 00:10:57 +0000 Subject: [PATCH 01/14] optimize bit_floor, bit_ceil, bit_width --- libcudacxx/include/cuda/std/__bit/integral.h | 135 ++++++++++++++---- .../std/__type_traits/is_unsigned_integer.h | 5 + 2 files changed, 116 insertions(+), 24 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index f0186ad9f5f..41edc6c5a0a 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -21,53 +21,140 @@ # pragma system_header #endif // no system header +// #include +#include #include -#include -#include +#include +#include +#include +#include #include #include -_LIBCUDACXX_BEGIN_NAMESPACE_STD +// the following section will be removed when the ptx bfind is added +namespace cuda::ptx +{ -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a) { - static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); - return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); + uint32_t d; + asm volatile( + "{ \n\t\t" + "bfind.u32 %0, %1; \n\t\t" + "}" + : "=r"(d) + : "r"(a)); + return d; } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a) { - return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); + uint32_t d; + asm volatile( + "{ \n\t\t" + "bfind.u64 %0, %1; \n\t\t" + "}" + : "=r"(d) + : "l"(a)); + return d; } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __ceil2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a) { - return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) - + (numeric_limits::digits - numeric_limits<_Tp>::digits))) - >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); + uint32_t d; + asm volatile( + "{ \n\t\t" + "bfind.u64 %0, %1; \n\t\t" + "}" + : "=r"(d) + : "l"(a)); + return d; } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a) { - return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); + return 0; } +} // namespace cuda::ptx + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#define _CCCL_CUDA_BUILTIN_ASSUME(...) \ + if (!_CUDA_VSTD::is_constant_evaluated()) \ + { \ + NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \ + } + template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept { - return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); + if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8) + { + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::cuda::ptx::bfind(__t);), // + (return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);)) + } + else + { + return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); + } } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept +{ + // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 + using _Up = _CUDA_VSTD::make_unsigned_t; + auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of__ret is int + _CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits); + return __ret; +} + +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { - return __t == 0 ? 0 : static_cast(__bit_log2(__t) + 1); + _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); + if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1) + { + return 1; + } + // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32 + // In CUDA, unsigned{1} << 32 --> 0 + // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator + using _Up = _CUDA_VSTD::make_unsigned_t; + auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up + // clang-format off + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width)); + _CCCL_BUILTIN_ASSUME(__ret >= __t) + return __ret;), + (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);)) + // clang-format on } +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept +{ + // __bit_log2 returns 0xFFFFFFFF if __t == 0 + // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type + // -> the result is 0 is __t == 0 + if (_CUDA_VSTD::is_constant_evaluated() && __t == 0) + { + return 0; + } + using _Up = _CUDA_VSTD::make_unsigned_t; + auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return _Tp{1} << __log2;), // + (return __t == 0 ? 0 : _Tp{1} << __log2;)) +} + +#undef _CCCL_CUDA_BUILTIN_ASSUME + _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___BIT_INTEGRAL_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index 088c98af66a..c7a44cd8583 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -12,6 +12,8 @@ #include +#include "cuda/std/__cccl/dialect.h" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -48,6 +50,9 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type {}; #endif +template +_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value; + _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H From 187ff3107c2eb045262f1d35db126fbc8a4a7cdd Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 9 Jan 2025 00:11:11 +0000 Subject: [PATCH 02/14] document performance considerations --- .../standard_api/numerics_library/bit.rst | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst index 75aaf12792e..e2c19a6c17d 100644 --- a/docs/libcudacxx/standard_api/numerics_library/bit.rst +++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst @@ -3,7 +3,23 @@ ```` ====================== -Extensions ----------- +CUDA Performance Considerations +------------------------------- -- All features of ```` are made available in C++11 onwards +- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``. +- ``bit_ceil()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be greater than or equal to the input. +- ``bit_floor()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be less than or equal to the input. +- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``. +- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions. +- ``rotl()/rotr()`` translate into a single ``SHF`` (funned shift) SASS instruction. +- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. +- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. +- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. +- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. + +Additional Notes +---------------- + +- All functions are marked ``[[nodiscard]]`` and ``noexcept`` +- All functions support ``__uint128_t`` +- ``bit_ceil()`` checks for overflow in debug mode From 5f40fc164b64d14b40a7932fd2c1635d11f663d1 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:52:28 -0800 Subject: [PATCH 03/14] Update libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index c7a44cd8583..7bc6652d3d0 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -12,8 +12,6 @@ #include -#include "cuda/std/__cccl/dialect.h" - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) From 294222bb8abd8843aa590f72aeb78ce971578a58 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 9 Jan 2025 22:51:49 +0000 Subject: [PATCH 04/14] use default macro for builtin_assume and avoid integer promotion --- libcudacxx/include/cuda/std/__bit/integral.h | 34 +++++++++---------- .../std/__type_traits/is_unsigned_integer.h | 4 +++ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 41edc6c5a0a..dc8200bae55 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -26,11 +26,13 @@ #include #include #include -#include +#include #include #include #include +#include "cuda/std/__type_traits/conditional.h" + // the following section will be removed when the ptx bfind is added namespace cuda::ptx { @@ -80,12 +82,6 @@ _LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a) _LIBCUDACXX_BEGIN_NAMESPACE_STD -#define _CCCL_CUDA_BUILTIN_ASSUME(...) \ - if (!_CUDA_VSTD::is_constant_evaluated()) \ - { \ - NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \ - } - template _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept { @@ -106,9 +102,9 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept { // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 - using _Up = _CUDA_VSTD::make_unsigned_t; + using _Up = conditional_t; auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of__ret is int - _CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits); + _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits); return __ret; } @@ -124,12 +120,12 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32 // In CUDA, unsigned{1} << 32 --> 0 // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator - using _Up = _CUDA_VSTD::make_unsigned_t; + using _Up = conditional_t; auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up // clang-format off NV_IF_ELSE_TARGET(NV_IS_DEVICE, (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width)); - _CCCL_BUILTIN_ASSUME(__ret >= __t) + _CCCL_BUILTIN_ASSUME(__ret >= __t); return __ret;), (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);)) // clang-format on @@ -139,21 +135,25 @@ _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept { - // __bit_log2 returns 0xFFFFFFFF if __t == 0 - // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type - // -> the result is 0 is __t == 0 if (_CUDA_VSTD::is_constant_evaluated() && __t == 0) { return 0; } - using _Up = _CUDA_VSTD::make_unsigned_t; + using _Up = conditional_t; auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); + // __bit_log2 returns 0xFFFFFFFF if __t == 0 + // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type + // -> the result is 0 is __t == 0 + // clang-format off NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return _Tp{1} << __log2;), // + (auto __ret = _Tp{1} << __log2; + _CCCL_BUILTIN_ASSUME(__ret <= __t); + return __ret;), (return __t == 0 ? 0 : _Tp{1} << __log2;)) + // clang-format on } -#undef _CCCL_CUDA_BUILTIN_ASSUME +#undef _CCCL_BUILTIN_ASSUME _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index 7bc6652d3d0..504c076f57c 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -48,9 +48,13 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type {}; #endif +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) + template _CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value; +#endif // !defined(_CCCL_NO_VARIABLE_TEMPLATES) + _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H From 31743f82af74450901d1b070599bb32cef526629 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 15 Jan 2025 15:37:58 -0800 Subject: [PATCH 05/14] Update libcudacxx/include/cuda/std/__bit/integral.h Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> --- libcudacxx/include/cuda/std/__bit/integral.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index dc8200bae55..e65123510b7 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -87,14 +87,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noe { if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8) { - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (return ::cuda::ptx::bfind(__t);), // - (return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);)) - } - else - { - return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); + NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);)) } + return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); } _CCCL_TEMPLATE(class _Tp) From 079780eb87d80517fa7e1a39ab1411ab280931ce Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 15 Jan 2025 16:58:53 -0800 Subject: [PATCH 06/14] revert __cccl_is_unsigned_integer_v to avoid conflict --- .../include/cuda/std/__type_traits/is_unsigned_integer.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index 504c076f57c..088c98af66a 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -48,13 +48,6 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type {}; #endif -#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) - -template -_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value; - -#endif // !defined(_CCCL_NO_VARIABLE_TEMPLATES) - _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H From 66a07102f62b9070a5314c52abd2cd4b8c4dbacf Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 15 Jan 2025 16:59:17 -0800 Subject: [PATCH 07/14] remove bfind --- libcudacxx/include/cuda/std/__bit/integral.h | 49 -------------------- 1 file changed, 49 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index e65123510b7..74e64a812ef 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -31,55 +31,6 @@ #include #include -#include "cuda/std/__type_traits/conditional.h" - -// the following section will be removed when the ptx bfind is added -namespace cuda::ptx -{ - -_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a) -{ - uint32_t d; - asm volatile( - "{ \n\t\t" - "bfind.u32 %0, %1; \n\t\t" - "}" - : "=r"(d) - : "r"(a)); - return d; -} - -_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a) -{ - uint32_t d; - asm volatile( - "{ \n\t\t" - "bfind.u64 %0, %1; \n\t\t" - "}" - : "=r"(d) - : "l"(a)); - return d; -} - -_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a) -{ - uint32_t d; - asm volatile( - "{ \n\t\t" - "bfind.u64 %0, %1; \n\t\t" - "}" - : "=r"(d) - : "l"(a)); - return d; -} - -_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a) -{ - return 0; -} - -} // namespace cuda::ptx - _LIBCUDACXX_BEGIN_NAMESPACE_STD template From 2dce21e606a8da78a951f8b4bcb1b3ecc5536e2e Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 15 Jan 2025 16:59:57 -0800 Subject: [PATCH 08/14] simplify the logic --- libcudacxx/include/cuda/std/__bit/integral.h | 41 ++++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 74e64a812ef..415c1a21c77 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept { - if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8) + if (!is_constant_evaluated() && sizeof(_Tp) <= 8) { NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);)) } @@ -59,7 +60,7 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); - if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1) + if (is_constant_evaluated() && __t <= 1) { return 1; } @@ -68,20 +69,23 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator using _Up = conditional_t; auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up - // clang-format off - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width)); - _CCCL_BUILTIN_ASSUME(__ret >= __t); - return __ret;), - (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);)) - // clang-format on + if (!is_constant_evaluated() && sizeof(_Tp) <= 8) + { + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __ret = static_cast<_Tp>(::max(1u, ::cuda::ptx::shl(1u, __width))); // + _CCCL_BUILTIN_ASSUME(__ret >= __t); + return __ret;)) + } + auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width); + _CCCL_BUILTIN_ASSUME(__ret >= __t); + return __ret; } _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept { - if (_CUDA_VSTD::is_constant_evaluated() && __t == 0) + if (is_constant_evaluated() && __t == 0) { return 0; } @@ -90,13 +94,16 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc // __bit_log2 returns 0xFFFFFFFF if __t == 0 // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type // -> the result is 0 is __t == 0 - // clang-format off - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (auto __ret = _Tp{1} << __log2; - _CCCL_BUILTIN_ASSUME(__ret <= __t); - return __ret;), - (return __t == 0 ? 0 : _Tp{1} << __log2;)) - // clang-format on + if (!is_constant_evaluated() && sizeof(_Tp) <= 8) + { + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(1u, __log2))); // + _CCCL_BUILTIN_ASSUME(__ret <= __t); + return __ret;) + } + auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; + _CCCL_BUILTIN_ASSUME(__ret <= __t); + return __ret; } #undef _CCCL_BUILTIN_ASSUME From 499b4f6a59cf1b9dd2d84a7f74930d4b30f6417b Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 24 Jan 2025 11:23:17 -0800 Subject: [PATCH 09/14] clean up --- libcudacxx/include/cuda/std/__bit/integral.h | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 415c1a21c77..668705fe2e2 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -35,7 +35,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept { if (!is_constant_evaluated() && sizeof(_Tp) <= 8) { @@ -49,7 +49,7 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept { // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 - using _Up = conditional_t; + using _Up = _If; auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of__ret is int _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits); return __ret; @@ -67,17 +67,17 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32 // In CUDA, unsigned{1} << 32 --> 0 // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator - using _Up = conditional_t; + using _Up = _If; auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up if (!is_constant_evaluated() && sizeof(_Tp) <= 8) { NV_IF_TARGET(NV_IS_DEVICE, // - (auto __ret = static_cast<_Tp>(::max(1u, ::cuda::ptx::shl(1u, __width))); // - _CCCL_BUILTIN_ASSUME(__ret >= __t); + (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, ::cuda::ptx::shl(_Tp{1}, __width))); // + _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2); return __ret;)) } auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width); - _CCCL_BUILTIN_ASSUME(__ret >= __t); + _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2); return __ret; } @@ -89,20 +89,20 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc { return 0; } - using _Up = conditional_t; + using _Up = _If; auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); // __bit_log2 returns 0xFFFFFFFF if __t == 0 // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type - // -> the result is 0 is __t == 0 + // -> the result is 0 if __t == 0 if (!is_constant_evaluated() && sizeof(_Tp) <= 8) { NV_IF_TARGET(NV_IS_DEVICE, // - (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(1u, __log2))); // - _CCCL_BUILTIN_ASSUME(__ret <= __t); + (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); // + _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); return __ret;) } auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; - _CCCL_BUILTIN_ASSUME(__ret <= __t); + _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); return __ret; } From 6c3b6e76a675ba234fbafddda979032c44e17e27 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Feb 2025 12:26:17 -0800 Subject: [PATCH 10/14] revert bit documentation --- .../standard_api/numerics_library/bit.rst | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst index e2c19a6c17d..75aaf12792e 100644 --- a/docs/libcudacxx/standard_api/numerics_library/bit.rst +++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst @@ -3,23 +3,7 @@ ```` ====================== -CUDA Performance Considerations -------------------------------- +Extensions +---------- -- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``. -- ``bit_ceil()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be greater than or equal to the input. -- ``bit_floor()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be less than or equal to the input. -- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``. -- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions. -- ``rotl()/rotr()`` translate into a single ``SHF`` (funned shift) SASS instruction. -- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. -- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. -- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. -- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``. - -Additional Notes ----------------- - -- All functions are marked ``[[nodiscard]]`` and ``noexcept`` -- All functions support ``__uint128_t`` -- ``bit_ceil()`` checks for overflow in debug mode +- All features of ```` are made available in C++11 onwards From f6091f6a0a4b9a32a8f0dfe603592890d1c693c7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Feb 2025 14:59:18 -0800 Subject: [PATCH 11/14] simplify the code --- libcudacxx/include/cuda/std/__bit/integral.h | 61 +++++++++----------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 668705fe2e2..26847d7ef23 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -21,25 +21,27 @@ # pragma system_header #endif // no system header -// #include +#include +// #include +// #include #include #include #include #include #include #include -#include #include #include _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept { - if (!is_constant_evaluated() && sizeof(_Tp) <= 8) + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) { - NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);)) + using _Up [[maybe_unused]] = _If; + NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));)) } return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); } @@ -48,10 +50,9 @@ _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept { - // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 - using _Up = _If; - auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of__ret is int - _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits); + // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 + auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1; + _CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits); return __ret; } @@ -60,24 +61,22 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); - if (is_constant_evaluated() && __t <= 1) - { - return 1; - } - // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32 - // In CUDA, unsigned{1} << 32 --> 0 - // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator - using _Up = _If; - auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up - if (!is_constant_evaluated() && sizeof(_Tp) <= 8) + // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32 + auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1)); +#ifdef PTX_SHL_SHR + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false) { + // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type + // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than ternary operator NV_IF_TARGET(NV_IS_DEVICE, // - (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, ::cuda::ptx::shl(_Tp{1}, __width))); // - _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2); + (auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width + auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); // + _CCCL_BUILTIN_ASSUME(__ret >= __t); return __ret;)) } - auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width); - _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2); +#endif + auto __ret = __t <= 1 ? _Tp{1} : _Tp{1} << __width; + _CCCL_BUILTIN_ASSUME(__ret >= __t); return __ret; } @@ -85,29 +84,25 @@ _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept { - if (is_constant_evaluated() && __t == 0) - { - return 0; - } using _Up = _If; auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); // __bit_log2 returns 0xFFFFFFFF if __t == 0 - // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type - // -> the result is 0 if __t == 0 - if (!is_constant_evaluated() && sizeof(_Tp) <= 8) +#ifdef PTX_SHL_SHR + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) { + // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type + // -> the result is 0 if __t == 0 NV_IF_TARGET(NV_IS_DEVICE, // (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); // _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); return __ret;) } - auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; +#endif + auto __ret = __t == 0 ? _Tp{0} : _Tp{1} << __log2; _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); return __ret; } -#undef _CCCL_BUILTIN_ASSUME - _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___BIT_INTEGRAL_H From cba67a2d1f473241c08b1c23cff1411839766470 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Feb 2025 17:09:34 -0800 Subject: [PATCH 12/14] fix MSVC warning --- libcudacxx/include/cuda/std/__bit/integral.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 26847d7ef23..070b0940382 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -75,9 +75,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce return __ret;)) } #endif - auto __ret = __t <= 1 ? _Tp{1} : _Tp{1} << __width; + auto __ret = __t <= 1 ? 1 : _Tp{1} << __width; _CCCL_BUILTIN_ASSUME(__ret >= __t); - return __ret; + return static_cast<_Tp>(__ret); } _CCCL_TEMPLATE(class _Tp) @@ -98,9 +98,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc return __ret;) } #endif - auto __ret = __t == 0 ? _Tp{0} : _Tp{1} << __log2; + auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); - return __ret; + return static_cast<_Tp>(__ret); } _LIBCUDACXX_END_NAMESPACE_STD From fc0e646fa7466720742766824a16b0b20e45666e Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 25 Feb 2025 15:10:09 -0800 Subject: [PATCH 13/14] add PTX shl and shr --- libcudacxx/include/cuda/std/__bit/integral.h | 74 ++++++++++++-------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 070b0940382..909599db757 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -22,8 +22,8 @@ #endif // no system header #include -// #include -// #include +#include +#include #include #include #include @@ -36,12 +36,21 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept { - if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) { - using _Up [[maybe_unused]] = _If; - NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));)) + if constexpr (sizeof(_Tp) <= 8) + { + using _Up [[maybe_unused]] = _If; + NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));)) + } + else + { + NV_IF_TARGET(NV_IS_DEVICE, + (auto __high = _CUDA_VPTX::bfind(static_cast(__t >> 64)); + return __high == ~uint32_t{0} ? _CUDA_VPTX::bfind(static_cast(__t)) : __high + 64;)) + } } return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); } @@ -52,7 +61,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexc { // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1; - _CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits); + _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits); return __ret; } @@ -60,24 +69,26 @@ _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { + using _Up = _If; _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32 - auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1)); -#ifdef PTX_SHL_SHR - if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false) + auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1); + if constexpr (sizeof(_Tp) <= 8) { - // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type - // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than ternary operator - NV_IF_TARGET(NV_IS_DEVICE, // - (auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width - auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); // - _CCCL_BUILTIN_ASSUME(__ret >= __t); - return __ret;)) + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type + // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than the ternary operator + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 2^(ceil(log2(__t - 1))) + auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); // + _CCCL_BUILTIN_ASSUME(__ret >= __t); + return __ret;)) + } } -#endif - auto __ret = __t <= 1 ? 1 : _Tp{1} << __width; + auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width); _CCCL_BUILTIN_ASSUME(__ret >= __t); - return static_cast<_Tp>(__ret); + return __ret; } _CCCL_TEMPLATE(class _Tp) @@ -87,20 +98,21 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc using _Up = _If; auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); // __bit_log2 returns 0xFFFFFFFF if __t == 0 -#ifdef PTX_SHL_SHR - if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8) + if constexpr (sizeof(_Tp) <= 8) { - // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type - // -> the result is 0 if __t == 0 - NV_IF_TARGET(NV_IS_DEVICE, // - (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); // - _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); - return __ret;) + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type + // -> the result is 0 if __t == 0 + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(log2(t)) + _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); + return __ret;)) + } } -#endif - auto __ret = __t == 0 ? 0 : _Tp{1} << __log2; + auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2); _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); - return static_cast<_Tp>(__ret); + return __ret; } _LIBCUDACXX_END_NAMESPACE_STD From bacdd2b6f7047a14b91f8c4c7374200b292d276e Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 26 Feb 2025 10:38:58 +0100 Subject: [PATCH 14/14] Use proper return type --- libcudacxx/include/cuda/std/__bit/integral.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 909599db757..f7a73a05fe7 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -62,7 +62,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexc // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1; _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits); - return __ret; + return static_cast(__ret); } _CCCL_TEMPLATE(class _Tp)