From 2aeb329489abcfd35bcbbbc1e92ed85bd41fcd82 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 9 Jan 2025 00:10:57 +0000
Subject: [PATCH 01/14] optimize bit_floor, bit_ceil, bit_width

---
 libcudacxx/include/cuda/std/__bit/integral.h  | 135 ++++++++++++++----
 .../std/__type_traits/is_unsigned_integer.h   |   5 +
 2 files changed, 116 insertions(+), 24 deletions(-)
diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index f0186ad9f5f..41edc6c5a0a 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,53 +21,140 @@
 #  pragma system_header
 #endif // no system header
 
+// #include <cuda/__ptx/instructions/bfind.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
-#include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
-_LIBCUDACXX_BEGIN_NAMESPACE_STD
+// the following section will be removed when the ptx bfind is added
+namespace cuda::ptx
+{
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a)
 {
-  static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned");
-  return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u32 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "r"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a)
 {
-  return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)));
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u64 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "l"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a)
 {
-  return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
-                        + (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
-                >> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u64 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "l"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a)
 {
-  return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
+  return 0;
 }
 
+} // namespace cuda::ptx
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+#define _CCCL_CUDA_BUILTIN_ASSUME(...)                           \
+  if (!_CUDA_VSTD::is_constant_evaluated())                      \
+  {                                                              \
+    NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \
+  }
+
 template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
 {
-  return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t));
+  if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8)
+  {
+    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                      (return ::cuda::ptx::bfind(__t);), //
+                      (return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);))
+  }
+  else
+  {
+    return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
+  }
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
+{
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
+  using _Up  = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of __ret is uint32_t
+  _CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
+  return __ret;
+}
+
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
-  return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
+  _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
+  if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1)
+  {
+    return 1;
+  }
+  // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
+  // In CUDA, unsigned{1} << 32 --> 0
+  // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator
+  using _Up    = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up
+  // clang-format off
+  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                   (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width));
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t)
+                    return __ret;),
+                   (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);))
+  // clang-format on
 }
 
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
+{
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0
+  // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
+  // -> the result is 0 is __t == 0
+  if (_CUDA_VSTD::is_constant_evaluated() && __t == 0)
+  {
+    return 0;
+  }
+  using _Up   = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
+  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                    (return _Tp{1} << __log2;), //
+                    (return __t == 0 ? 0 : _Tp{1} << __log2;))
+}
+
+#undef _CCCL_CUDA_BUILTIN_ASSUME
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___BIT_INTEGRAL_H
diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
index 088c98af66a..c7a44cd8583 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
@@ -12,6 +12,8 @@
 
 #include <cuda/std/detail/__config>
 
+#include "cuda/std/__cccl/dialect.h"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -48,6 +50,9 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type
 {};
 #endif
 
+template <typename _Tp>
+_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value;
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H

From 187ff3107c2eb045262f1d35db126fbc8a4a7cdd Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 9 Jan 2025 00:11:11 +0000
Subject: [PATCH 02/14] document performance considerations

---
 .../standard_api/numerics_library/bit.rst     | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst
index 75aaf12792e..e2c19a6c17d 100644
--- a/docs/libcudacxx/standard_api/numerics_library/bit.rst
+++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst
@@ -3,7 +3,23 @@
 ``<cuda/std/bit>``
 ======================
 
-Extensions
-----------
+CUDA Performance Considerations
+-------------------------------
 
--  All features of ``<bit>`` are made available in C++11 onwards
+- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
+- ``bit_ceil()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be greater than or equal to the input.
+- ``bit_floor()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be less than or equal to the input.
+- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
+- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions.
+- ``rotl()/rotr()`` translate into a single ``SHF`` (funnel shift) SASS instruction.
+- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+
+Additional Notes
+----------------
+
+- All functions are marked ``[[nodiscard]]`` and ``noexcept``
+- All functions support ``__uint128_t``
+- ``bit_ceil()`` checks for overflow in debug mode

From 5f40fc164b64d14b40a7932fd2c1635d11f663d1 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Thu, 9 Jan 2025 09:52:28 -0800
Subject: [PATCH 03/14] Update
 libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
---
 libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
index c7a44cd8583..7bc6652d3d0 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
@@ -12,8 +12,6 @@
 
 #include <cuda/std/detail/__config>
 
-#include "cuda/std/__cccl/dialect.h"
-
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)

From 294222bb8abd8843aa590f72aeb78ce971578a58 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 9 Jan 2025 22:51:49 +0000
Subject: [PATCH 04/14] use default macro for builtin_assume and avoid integer
 promotion

---
 libcudacxx/include/cuda/std/__bit/integral.h  | 34 +++++++++----------
 .../std/__type_traits/is_unsigned_integer.h   |  4 +++
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 41edc6c5a0a..dc8200bae55 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -26,11 +26,13 @@
 #include <cuda/std/__bit/countl.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
-#include <cuda/std/__type_traits/is_unsigned.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
+#include "cuda/std/__type_traits/conditional.h"
+
 // the following section will be removed when the ptx bfind is added
 namespace cuda::ptx
 {
@@ -80,12 +82,6 @@ _LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a)
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
-#define _CCCL_CUDA_BUILTIN_ASSUME(...)                           \
-  if (!_CUDA_VSTD::is_constant_evaluated())                      \
-  {                                                              \
-    NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \
-  }
-
 template <class _Tp>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
 {
@@ -106,9 +102,9 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
 {
   // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
-  using _Up  = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  using _Up  = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of __ret is uint32_t
-  _CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
+  _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
   return __ret;
 }
 
@@ -124,12 +120,12 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce
   // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
   // In CUDA, unsigned{1} << 32 --> 0
   // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator
-  using _Up    = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  using _Up    = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up
   // clang-format off
   NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                    (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width));
-                    _CCCL_BUILTIN_ASSUME(__ret >= __t)
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t);
                     return __ret;),
                    (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);))
   // clang-format on
@@ -139,21 +135,25 @@ _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
 {
-  // __bit_log2 returns 0xFFFFFFFF if __t == 0
-  // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
-  // -> the result is 0 is __t == 0
   if (_CUDA_VSTD::is_constant_evaluated() && __t == 0)
   {
     return 0;
   }
-  using _Up   = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  using _Up   = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0
+  // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
+  // -> the result is 0 is __t == 0
+  // clang-format off
   NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (return _Tp{1} << __log2;), //
+                    (auto __ret = _Tp{1} << __log2;
+                     _CCCL_BUILTIN_ASSUME(__ret <= __t);
+                     return __ret;),
                     (return __t == 0 ? 0 : _Tp{1} << __log2;))
+  // clang-format on
 }
 
-#undef _CCCL_CUDA_BUILTIN_ASSUME
+#undef _CCCL_BUILTIN_ASSUME
 
 _LIBCUDACXX_END_NAMESPACE_STD
 
diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
index 7bc6652d3d0..504c076f57c 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
@@ -48,9 +48,13 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type
 {};
 #endif
 
+#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
+
 template <typename _Tp>
 _CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value;
 
+#endif // !defined(_CCCL_NO_VARIABLE_TEMPLATES)
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H

From 31743f82af74450901d1b070599bb32cef526629 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Wed, 15 Jan 2025 15:37:58 -0800
Subject: [PATCH 05/14] Update libcudacxx/include/cuda/std/__bit/integral.h

Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
---
 libcudacxx/include/cuda/std/__bit/integral.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index dc8200bae55..e65123510b7 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -87,14 +87,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noe
 {
   if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
-    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                      (return ::cuda::ptx::bfind(__t);), //
-                      (return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);))
-  }
-  else
-  {
-    return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
+    NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);))
   }
+  return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
 }
 
 _CCCL_TEMPLATE(class _Tp)

From 079780eb87d80517fa7e1a39ab1411ab280931ce Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 15 Jan 2025 16:58:53 -0800
Subject: [PATCH 06/14] revert __cccl_is_unsigned_integer_v to avoid conflict

---
 .../include/cuda/std/__type_traits/is_unsigned_integer.h   | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
index 504c076f57c..088c98af66a 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
@@ -48,13 +48,6 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type
 {};
 #endif
 
-#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
-
-template <typename _Tp>
-_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value;
-
-#endif // !defined(_CCCL_NO_VARIABLE_TEMPLATES)
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H

From 66a07102f62b9070a5314c52abd2cd4b8c4dbacf Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 15 Jan 2025 16:59:17 -0800
Subject: [PATCH 07/14] remove bfind

---
 libcudacxx/include/cuda/std/__bit/integral.h | 49 --------------------
 1 file changed, 49 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index e65123510b7..74e64a812ef 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -31,55 +31,6 @@
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
-#include "cuda/std/__type_traits/conditional.h"
-
-// the following section will be removed when the ptx bfind is added
-namespace cuda::ptx
-{
-
-_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a)
-{
-  uint32_t d;
-  asm volatile(
-    "{                                                   \n\t\t"
-    "bfind.u32 %0, %1;                      \n\t\t"
-    "}"
-    : "=r"(d)
-    : "r"(a));
-  return d;
-}
-
-_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a)
-{
-  uint32_t d;
-  asm volatile(
-    "{                                                   \n\t\t"
-    "bfind.u64 %0, %1;                      \n\t\t"
-    "}"
-    : "=r"(d)
-    : "l"(a));
-  return d;
-}
-
-_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a)
-{
-  uint32_t d;
-  asm volatile(
-    "{                                                   \n\t\t"
-    "bfind.u64 %0, %1;                      \n\t\t"
-    "}"
-    : "=r"(d)
-    : "l"(a));
-  return d;
-}
-
-_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a)
-{
-  return 0;
-}
-
-} // namespace cuda::ptx
-
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>

From 2dce21e606a8da78a951f8b4bcb1b3ecc5536e2e Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 15 Jan 2025 16:59:57 -0800
Subject: [PATCH 08/14] simplify the logic

---
 libcudacxx/include/cuda/std/__bit/integral.h | 41 ++++++++++++--------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 74e64a812ef..415c1a21c77 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -25,6 +25,7 @@
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
 #include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
 #include <cuda/std/__type_traits/is_unsigned_integer.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
@@ -36,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <class _Tp>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
 {
-  if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8)
+  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
     NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);))
   }
@@ -59,7 +60,7 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
   _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
-  if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1)
+  if (is_constant_evaluated() && __t <= 1)
   {
     return 1;
   }
@@ -68,20 +69,23 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce
   // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator
   using _Up    = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up
-  // clang-format off
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                   (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width));
-                    _CCCL_BUILTIN_ASSUME(__ret >= __t);
-                    return __ret;),
-                   (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);))
-  // clang-format on
+  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, //
+                 (auto __ret = static_cast<_Tp>(::max(1u, ::cuda::ptx::shl(1u, __width))); //
+                  _CCCL_BUILTIN_ASSUME(__ret >= __t);
+                  return __ret;))
+  }
+  auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);
+  _CCCL_BUILTIN_ASSUME(__ret >= __t);
+  return __ret;
 }
 
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
 {
-  if (_CUDA_VSTD::is_constant_evaluated() && __t == 0)
+  if (is_constant_evaluated() && __t == 0)
   {
     return 0;
   }
@@ -90,13 +94,16 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc
   // __bit_log2 returns 0xFFFFFFFF if __t == 0
   // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
   // -> the result is 0 is __t == 0
-  // clang-format off
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (auto __ret = _Tp{1} << __log2;
-                     _CCCL_BUILTIN_ASSUME(__ret <= __t);
-                     return __ret;),
-                    (return __t == 0 ? 0 : _Tp{1} << __log2;))
-  // clang-format on
+  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, //
+                 (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(1u, __log2))); //
+                 _CCCL_BUILTIN_ASSUME(__ret <= __t);
+                 return __ret;)
+  }
+  auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
+  _CCCL_BUILTIN_ASSUME(__ret <= __t);
+  return __ret;
 }
 
 #undef _CCCL_BUILTIN_ASSUME

From 499b4f6a59cf1b9dd2d84a7f74930d4b30f6417b Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Fri, 24 Jan 2025 11:23:17 -0800
Subject: [PATCH 09/14] clean up

---
 libcudacxx/include/cuda/std/__bit/integral.h | 22 ++++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 415c1a21c77..668705fe2e2 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -35,7 +35,7 @@
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
 {
   if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
@@ -49,7 +49,7 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
 {
   // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
-  using _Up  = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
+  using _Up  = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of __ret is uint32_t
   _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
   return __ret;
@@ -67,17 +67,17 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce
   // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
   // In CUDA, unsigned{1} << 32 --> 0
   // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator
-  using _Up    = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
+  using _Up    = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up
   if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
     NV_IF_TARGET(NV_IS_DEVICE, //
-                 (auto __ret = static_cast<_Tp>(::max(1u, ::cuda::ptx::shl(1u, __width))); //
-                  _CCCL_BUILTIN_ASSUME(__ret >= __t);
+                 (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, ::cuda::ptx::shl(_Tp{1}, __width))); //
+                  _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2);
                   return __ret;))
   }
   auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);
-  _CCCL_BUILTIN_ASSUME(__ret >= __t);
+  _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2);
   return __ret;
 }
 
@@ -89,20 +89,20 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc
   {
     return 0;
   }
-  using _Up   = conditional_t<sizeof(_Tp) <= 4, uint32_t, _Tp>;
+  using _Up   = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
   // __bit_log2 returns 0xFFFFFFFF if __t == 0
   // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
-  // -> the result is 0 is __t == 0
+  // -> the result is 0 if __t == 0
   if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
     NV_IF_TARGET(NV_IS_DEVICE, //
-                 (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(1u, __log2))); //
-                 _CCCL_BUILTIN_ASSUME(__ret <= __t);
+                 (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); //
+                 _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
                  return __ret;)
   }
   auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
-  _CCCL_BUILTIN_ASSUME(__ret <= __t);
+  _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
   return __ret;
 }
 

From 6c3b6e76a675ba234fbafddda979032c44e17e27 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Fri, 21 Feb 2025 12:26:17 -0800
Subject: [PATCH 10/14] revert bit documentation

---
 .../standard_api/numerics_library/bit.rst     | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst
index e2c19a6c17d..75aaf12792e 100644
--- a/docs/libcudacxx/standard_api/numerics_library/bit.rst
+++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst
@@ -3,23 +3,7 @@
 ``<cuda/std/bit>``
 ======================
 
-CUDA Performance Considerations
--------------------------------
+Extensions
+----------
 
-- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
-- ``bit_ceil()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be greater than or equal to the input.
-- ``bit_floor()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be less than or equal to the input.
-- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
-- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions.
-- ``rotl()/rotr()`` translate into a single ``SHF`` (funnel shift) SASS instruction.
-- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
-- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
-- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
-- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
-
-Additional Notes
-----------------
-
-- All functions are marked ``[[nodiscard]]`` and ``noexcept``
-- All functions support ``__uint128_t``
-- ``bit_ceil()`` checks for overflow in debug mode
+-  All features of ``<bit>`` are made available in C++11 onwards

From f6091f6a0a4b9a32a8f0dfe603592890d1c693c7 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Fri, 21 Feb 2025 14:59:18 -0800
Subject: [PATCH 11/14] simplify the code

---
 libcudacxx/include/cuda/std/__bit/integral.h | 61 +++++++++-----------
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 668705fe2e2..26847d7ef23 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -21,25 +21,27 @@
 #  pragma system_header
 #endif // no system header
 
-// #include <cuda/__ptx/instructions/bfind.h>
+#include <cuda/__ptx/instructions/bfind.h>
+// #include <cuda/__ptx/instructions/shl.h>
+// #include <cuda/__ptx/instructions/shr.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
 #include <cuda/std/__type_traits/is_unsigned_integer.h>
-#include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept
 {
-  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
-    NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(__t);))
+    using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
+    NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));))
   }
   return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
 }
@@ -48,10 +50,9 @@ _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
 {
-  // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
-  using _Up  = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
-  auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // type of__ret is int
-  _CCCL_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
+  // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
+  auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
+  _CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
   return __ret;
 }
 
@@ -60,24 +61,22 @@ _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
   _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
-  if (is_constant_evaluated() && __t <= 1)
-  {
-    return 1;
-  }
-  // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
-  // In CUDA, unsigned{1} << 32 --> 0
-  // The result is computed as max(1, bit_width(__t - 1)) because max() requires less instructions than ternary operator
-  using _Up    = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
-  auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // type of __ret is _Up
-  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
+  // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
+  auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1));
+#ifdef PTX_SHL_SHR
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false)
   {
+    // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type
+    // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than ternary operator
     NV_IF_TARGET(NV_IS_DEVICE, //
-                 (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, ::cuda::ptx::shl(_Tp{1}, __width))); //
-                  _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2);
+                 (auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width
+                  auto __ret   = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); //
+                  _CCCL_BUILTIN_ASSUME(__ret >= __t);
                   return __ret;))
   }
-  auto __ret = static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);
-  _CCCL_BUILTIN_ASSUME(__ret >= __t && __ret <= __t * 2);
+#endif
+  auto __ret = __t <= 1 ? _Tp{1} : _Tp{1} << __width;
+  _CCCL_BUILTIN_ASSUME(__ret >= __t);
   return __ret;
 }
 
@@ -85,29 +84,25 @@ _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
 {
-  if (is_constant_evaluated() && __t == 0)
-  {
-    return 0;
-  }
   using _Up   = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
   // __bit_log2 returns 0xFFFFFFFF if __t == 0
-  // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
-  // -> the result is 0 if __t == 0
-  if (!is_constant_evaluated() && sizeof(_Tp) <= 8)
+#ifdef PTX_SHL_SHR
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
   {
+    // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
+    // -> the result is 0 if __t == 0
     NV_IF_TARGET(NV_IS_DEVICE, //
                  (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); //
                  _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
                  return __ret;)
   }
-  auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
+#endif
+  auto __ret = __t == 0 ? _Tp{0} : _Tp{1} << __log2;
   _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
   return __ret;
 }
 
-#undef _CCCL_BUILTIN_ASSUME
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___BIT_INTEGRAL_H

From cba67a2d1f473241c08b1c23cff1411839766470 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Fri, 21 Feb 2025 17:09:34 -0800
Subject: [PATCH 12/14] fix MSVC warning

---
 libcudacxx/include/cuda/std/__bit/integral.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 26847d7ef23..070b0940382 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -75,9 +75,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexce
                   return __ret;))
   }
 #endif
-  auto __ret = __t <= 1 ? _Tp{1} : _Tp{1} << __width;
+  auto __ret = __t <= 1 ? 1 : _Tp{1} << __width;
   _CCCL_BUILTIN_ASSUME(__ret >= __t);
-  return __ret;
+  return static_cast<_Tp>(__ret);
 }
 
 _CCCL_TEMPLATE(class _Tp)
@@ -98,9 +98,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc
                  return __ret;)
   }
 #endif
-  auto __ret = __t == 0 ? _Tp{0} : _Tp{1} << __log2;
+  auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
   _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
-  return __ret;
+  return static_cast<_Tp>(__ret);
 }
 
 _LIBCUDACXX_END_NAMESPACE_STD

From fc0e646fa7466720742766824a16b0b20e45666e Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 25 Feb 2025 15:10:09 -0800
Subject: [PATCH 13/14] add PTX shl and shr

---
 libcudacxx/include/cuda/std/__bit/integral.h | 74 ++++++++++++--------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 070b0940382..909599db757 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -22,8 +22,8 @@
 #endif // no system header
 
 #include <cuda/__ptx/instructions/bfind.h>
-// #include <cuda/__ptx/instructions/shl.h>
-// #include <cuda/__ptx/instructions/shr.h>
+#include <cuda/__ptx/instructions/shl.h>
+#include <cuda/__ptx/instructions/shr.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
 #include <cuda/std/__concepts/concept_macros.h>
@@ -36,12 +36,21 @@
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr int __bit_log2(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
 {
-  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
   {
-    using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
-    NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::ptx::bfind(static_cast<_Up>(__t));))
+    if constexpr (sizeof(_Tp) <= 8)
+    {
+      using _Up [[maybe_unused]] = _If<sizeof(_Tp) <= 4, uint32_t, uint64_t>;
+      NV_IF_TARGET(NV_IS_DEVICE, (return _CUDA_VPTX::bfind(static_cast<_Up>(__t));))
+    }
+    else
+    {
+      NV_IF_TARGET(NV_IS_DEVICE,
+                   (auto __high = _CUDA_VPTX::bfind(static_cast<uint64_t>(__t >> 64));
+                    return __high == ~uint32_t{0} ? _CUDA_VPTX::bfind(static_cast<uint64_t>(__t)) : __high + 64;))
+    }
   }
   return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
 }
@@ -52,7 +61,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexc
 {
   // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
   auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
-  _CCCL_BUILTIN_ASSUME(__ret >= 0 && __ret <= numeric_limits<_Tp>::digits);
+  _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits);
   return __ret;
 }
 
@@ -60,24 +69,26 @@ _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
+  using _Up = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2, "bit_ceil overflow");
   // if __t == 0, __t - 1 == 0xFFFFFFFF, bit_width(0xFFFFFFFF) returns 32
-  auto __width = _CUDA_VSTD::bit_width(static_cast<_Tp>(__t - 1));
-#ifdef PTX_SHL_SHR
-  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8 && false)
+  auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t) - 1);
+  if constexpr (sizeof(_Tp) <= 8)
   {
-    // CUDA right shift (ptx::shr) returns 0 if the right operand is larger than the number of bits of the type
-    // The result is computed as max(1, bit_width(__t - 1)) because it is more efficient than ternary operator
-    NV_IF_TARGET(NV_IS_DEVICE, //
-                 (auto __shift = ::cuda::ptx::shl(_Tp{1}, __width); // 1 << width
-                  auto __ret   = static_cast<_Tp>(_CUDA_VSTD::max(_Tp{1}, __shift)); //
-                  _CCCL_BUILTIN_ASSUME(__ret >= __t);
-                  return __ret;))
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // CUDA left shift (ptx::shl) returns 0 if the shift amount is >= the number of bits of the shifted type
+      // The result is max(1, 1 << bit_width(__t - 1)) because max() is more efficient than the ternary operator
+      NV_IF_TARGET(NV_IS_DEVICE, //
+                   (auto __shift = _CUDA_VPTX::shl(_Up{1}, __width); // 1 << bit_width(__t - 1)
+                    auto __ret   = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, __shift)); //
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t);
+                    return __ret;))
+    }
   }
-#endif
-  auto __ret = __t <= 1 ? 1 : _Tp{1} << __width;
+  auto __ret = static_cast<_Tp>(__t <= 1 ? _Up{1} : _Up{1} << __width);
   _CCCL_BUILTIN_ASSUME(__ret >= __t);
-  return static_cast<_Tp>(__ret);
+  return __ret;
 }
 
 _CCCL_TEMPLATE(class _Tp)
@@ -87,20 +98,21 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexc
   using _Up   = _If<sizeof(_Tp) <= 4, uint32_t, _Tp>;
   auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
   // __bit_log2 returns 0xFFFFFFFF if __t == 0
-#ifdef PTX_SHL_SHR
-  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated() && sizeof(_Tp) <= 8)
+  if constexpr (sizeof(_Tp) <= 8)
   {
-    // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
-    // -> the result is 0 if __t == 0
-    NV_IF_TARGET(NV_IS_DEVICE, //
-                 (auto __ret = static_cast<_Tp>(::cuda::ptx::shl(_Tp{1}, __log2))); //
-                 _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
-                 return __ret;)
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // CUDA left shift (ptx::shl) returns 0 if the right operand is larger than the number of bits of the type
+      // -> the result is 0 if __t == 0
+      NV_IF_TARGET(NV_IS_DEVICE, //
+                   (auto __ret = static_cast<_Tp>(_CUDA_VPTX::shl(_Up{1}, __log2)); // 2^(log2(t))
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
+                    return __ret;))
+    }
   }
-#endif
-  auto __ret = __t == 0 ? 0 : _Tp{1} << __log2;
+  auto __ret = static_cast<_Tp>(__t == 0 ? _Up{0} : _Up{1} << __log2);
   _CCCL_BUILTIN_ASSUME(__ret >= __t / 2 && __ret <= __t);
-  return static_cast<_Tp>(__ret);
+  return __ret;
 }
 
 _LIBCUDACXX_END_NAMESPACE_STD

From bacdd2b6f7047a14b91f8c4c7374200b292d276e Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 26 Feb 2025 10:38:58 +0100
Subject: [PATCH 14/14] Use proper return type

---
 libcudacxx/include/cuda/std/__bit/integral.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h
index 909599db757..f7a73a05fe7 100644
--- a/libcudacxx/include/cuda/std/__bit/integral.h
+++ b/libcudacxx/include/cuda/std/__bit/integral.h
@@ -62,7 +62,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexc
   // if __t == 0, __bit_log2(0) returns 0xFFFFFFFF. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
   auto __ret = _CUDA_VSTD::__bit_log2(__t) + 1;
   _CCCL_BUILTIN_ASSUME(__ret <= numeric_limits<_Tp>::digits);
-  return __ret;
+  return static_cast<int>(__ret);
 }
 
 _CCCL_TEMPLATE(class _Tp)