From 00b545b839131171da83b4efcd9c0b3b3e04adbb Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 26 Feb 2025 09:35:02 -0800 Subject: [PATCH] Optimize `bit_floor`, `bit_ceil`, `bit_width` (#3296) Co-authored-by: Michael Schellenberger Costa Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> --- libcudacxx/include/cuda/std/__bit/integral.h | 101 ++++++++++++++----- 1 file changed, 74 insertions(+), 27 deletions(-) diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index b753f12d227..f7a73a05fe7 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -21,8 +21,14 @@ # pragma system_header #endif // no system header +#include +#include +#include +#include #include -#include +#include +#include +#include #include #include #include @@ -32,40 +38,81 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept { - static_assert(__cccl_is_unsigned_integer_v<_Tp>, "__bit_log2 requires unsigned"); - return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + if constexpr (sizeof(_Tp) <= 8) + { + using _Up [[maybe_unused]] = _If; + NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));)) + } + else + { + NV_IF_TARGET(NV_IS_DEVICE, + (auto __high = _CUDA_VPTX::bfind(static_cast(__t >> 64)); + return __high == ~uint32_t{0} ? 
_CUDA_VPTX::bfind(static_cast(__t)) : __high + 64;)) + } + } + return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t); } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept -{ - return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __ceil2(_Tp __t) noexcept +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept { - return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u))) - + (numeric_limits::digits - numeric_limits<_Tp>::digits))) - >> (numeric_limits::digits - numeric_limits<_Tp>::digits)); + // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0 + auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1; + _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits); + return static_cast(__ret); } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_floor(_Tp __t) noexcept +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { - return __t == 0 ? 
0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); + using _Up = _If; + _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow"); + // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32 + auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1); + if constexpr (sizeof(_Tp) <= 8) + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // CUDA left shift (ptx::shl) returns 0 if the right operand is greater than or equal to the number of bits of the type + // The result is computed as max(1, 1 << bit_width(__t - 1)) because it is more efficient than the ternary operator + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 2^(bit_width(__t - 1)) + auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); // + _CCCL_BUILTIN_ASSUME(__ret >= __t); + return __ret;)) + } + } + auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width); + _CCCL_BUILTIN_ASSUME(__ret >= __t); + return __ret; } -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_ceil(_Tp __t) noexcept -{ - return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t)); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, int> bit_width(_Tp __t) noexcept +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp)) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept { - return __t == 0 ? 
0 : static_cast(__bit_log2(__t) + 1); + using _Up = _If; + auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)); + // __bit_log2 returns 0xFFFFFFFF if __t == 0 + if constexpr (sizeof(_Tp) <= 8) + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type + // -> the result is 0 if __t == 0 + NV_IF_TARGET(NV_IS_DEVICE, // + (auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(log2(t)) + _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); + return __ret;)) + } + } + auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2); + _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t); + return __ret; } _LIBCUDACXX_END_NAMESPACE_STD