From 00b545b839131171da83b4efcd9c0b3b3e04adbb Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Wed, 26 Feb 2025 09:35:02 -0800
Subject: [PATCH] Optimize `bit_floor`, `bit_ceil`, `bit_width` (#3296)

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
---
 libcudacxx/include/cuda/std/__bit/integral.h | 101 ++++++++++++++-----
 1 file changed, 74 insertions(+), 27 deletions(-)
diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index b753f12d227..f7a73a05fe7 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,8 +21,14 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__ptx/instructions/bfind.h>
+#include <cuda/__ptx/instructions/shl.h>
+#include <cuda/__ptx/instructions/shr.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
-#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
 #include <cuda/std/__type_traits/is_unsigned_integer.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
@@ -32,40 +38,81 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <class _Tp>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
 {
-  static_assert(__cccl_is_unsigned_integer_v<_Tp>, "__bit_log2 requires unsigned");
-  return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+  {
+    if constexpr (sizeof(_Tp) <= 8)
+    {
+      using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
+      NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));))
+    }
+    else
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (auto __high = _CUDA_VPTX::bfind(static_cast<uint64_t>(__t >> 64));
+                    return __high == ~uint32_t{0} ? _CUDA_VPTX::bfind(static_cast<uint64_t>(__t)) : __high + 64;))
+    }
+  }
+  return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
-{
-  return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)));
-}
-
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
 {
-  return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
-                        + (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
-                >> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
+  // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
+  auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
+  _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits);
+  return static_cast<int>(__ret);
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_floor(_Tp __t) noexcept
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
-  return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
+  using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
+  _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
+  // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
+  auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1);
+  if constexpr (sizeof(_Tp) <= 8)
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type
+      // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than the ternary operator
+      NV_IF_TARGET(NV_IS_DEVICE, //
+                   (auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 2^(ceil(log2(__t - 1)))
+                    auto __ret   = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); //
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t);
+                    return __ret;))
+    }
+  }
+  auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width);
+  _CCCL_BUILTIN_ASSUME(__ret >= __t);
+  return __ret;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, _Tp> bit_ceil(_Tp __t) noexcept
-{
-  return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t));
-}
-
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer_v<_Tp>, int> bit_width(_Tp __t) noexcept
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
 {
-  return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
+  using _Up   = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
+  auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0
+  if constexpr (sizeof(_Tp) <= 8)
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
+      // -> the result is 0 if __t == 0
+      NV_IF_TARGET(NV_IS_DEVICE, //
+                   (auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(log2(t))
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
+                    return __ret;))
+    }
+  }
+  auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2);
+  _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
+  return __ret;
 }
 
 _LIBCUDACXX_END_NAMESPACE_STD