NVIDIA · fbusato · Feb 26, 2025 · Jan 9, 2025 · Jan 9, 2025 · Jan 9, 2025
diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst
@@ -3,7 +3,23 @@
 ``<cuda/std/bit>``
 ======================
 
-Extensions
-----------
+CUDA Performance Considerations
+-------------------------------
 
--  All features of ``<bit>`` are made available in C++11 onwards
+- ``bit_width()`` translates into a single ``FLO`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
+- ``bit_ceil()`` translates into ``ADD, FLO, SHL, IMINMAX`` SASS instructions. The result is assumed to be greater than or equal to the input.
+- ``bit_floor()`` translates into ``FLO, SHL`` SASS instructions. The result is assumed to be less than or equal to the input.
+- ``popcount()`` translates into a single ``POPC`` SASS instruction. The result is assumed to be in the range ``[0, N-bit]``.
+- ``has_single_bit()`` translates into ``POPC + ISETP`` SASS instructions.
+- ``rotl()/rotr()`` translate into a single ``SHF`` (funnel shift) SASS instruction.
+- ``countl_zero()`` translates into ``FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countl_one()`` translates into ``LOP3, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countr_zero()`` translates into ``BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+- ``countr_one()`` translates into ``LOP3, BREV, FLO, IMINMAX`` SASS instructions. The result is assumed to be in the range ``[0, N-bit]``.
+
+Additional Notes
+----------------
+
+- All functions are marked ``[[nodiscard]]`` and ``noexcept``
+- All functions support ``__uint128_t``
+- ``bit_ceil()`` checks for overflow in debug mode
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,53 +21,140 @@
 #  pragma system_header
 #endif // no system header
 
+// #include <cuda/__ptx/instructions/bfind.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/countl.h>
-#include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 
-_LIBCUDACXX_BEGIN_NAMESPACE_STD
+// the following section will be removed when the ptx bfind is added
+namespace cuda::ptx
+{
+// Temporary stand-ins for the PTX `bfind` wrappers: each overload forwards its operand to an inline-PTX `bfind`
+// instruction of the matching width and returns the 32-bit destination register.
+// NOTE(review): callers in this file rely on `bfind` producing 0xFFFFFFFF for a zero operand -- confirm against
+// the PTX ISA description of `bfind`.
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept
+// 32-bit operand: issues `bfind.u32` with the operand bound to a 32-bit register ("r").
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned a)
 {
-  static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned");
-  return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u32 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "r"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) >= sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
+// 64-bit operand: issues `bfind.u64` with the operand bound to a 64-bit register ("l"); the result is still 32-bit.
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(uint64_t a)
 {
-  return _Tp{1} << (numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)));
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u64 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "l"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<sizeof(_Tp) < sizeof(uint32_t), _Tp> __ceil2(_Tp __t) noexcept
+// Same as the uint64_t overload, for `unsigned long long`.
+// NOTE(review): on platforms where uint64_t is an alias of unsigned long long this would redefine the overload
+// above -- confirm which host platforms this header must compile on.
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(unsigned long long a)
 {
-  return (_Tp) ((1u << ((numeric_limits<_Tp>::digits - __countl_zero((_Tp) (__t - 1u)))
-                        + (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
-                >> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
+  uint32_t d;
+  asm volatile(
+    "{                                                   \n\t\t"
+    "bfind.u64 %0, %1;                      \n\t\t"
+    "}"
+    : "=r"(d)
+    : "l"(a));
+  return d;
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept
+// 128-bit placeholder: ignores its argument and always returns 0.
+// NOTE(review): presumably present only so the `bfind(__t)` call in __bit_log2 stays well-formed for __uint128_t;
+// that call is guarded at run time by `sizeof(_Tp) <= 8`, so this body should never execute -- confirm.
+_LIBCUDACXX_HIDE_FROM_ABI uint32_t bfind(__uint128_t a)
 {
-  return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t));
+  return 0;
 }
 
+} // namespace cuda::ptx
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+// Emits `__builtin_assume(<args>)` only when compiling for the device (NV_IS_DEVICE) and only when the enclosing
+// call is not being constant-evaluated. In every other case no assumption is emitted.
+// NOTE(review): the expansion is a bare `if { ... }` statement, so do not use it as the unbraced body of an
+// if/else.
+#define _CCCL_CUDA_BUILTIN_ASSUME(...)                           \
+  if (!_CUDA_VSTD::is_constant_evaluated())                      \
+  {                                                              \
+    NV_IF_TARGET(NV_IS_DEVICE, (__builtin_assume(__VA_ARGS__);)) \
+  }
+
+// Returns the zero-based index of the most significant set bit of __t as a 32-bit unsigned value.
+// - Run-time device code with sizeof(_Tp) <= 8 calls ::cuda::ptx::bfind.
+// - Host code, constant evaluation and wider types compute `digits - 1 - countl_zero(__t)`.
+// For __t == 0 the callers below expect the value 0xFFFFFFFF (see the comments in bit_width / bit_floor).
 template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept
+_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::uint32_t __bit_log2(_Tp __t) noexcept
 {
-  return (__t < 2) ? 1 : static_cast<_Tp>(__ceil2(__t));
+  // The size test is an ordinary `if`, so both branches are compiled for every _Tp
+  if (!_CUDA_VSTD::is_constant_evaluated() && sizeof(_Tp) <= 8)
+  {
+    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                      (return ::cuda::ptx::bfind(__t);), //
+                      (return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);))
+  }
+  else
+  {
+    return numeric_limits<_Tp>::digits - 1 - _CUDA_VSTD::countl_zero(__t);
+  }
 }
 
-template <class _Tp>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept
+// Returns the number of bits needed to represent __t: one plus the index of its highest set bit, or 0 when
+// __t == 0. Only participates in overload resolution for types accepted by __cccl_is_unsigned_integer.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept
+{
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0. Since unsigned overflow is well-defined, the result is -1 + 1 = 0
+  // _Up is the unsigned counterpart of the integer-promoted _Tp (decltype(+_Tp{}))
+  using _Up  = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __ret = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t)) + 1; // __ret keeps __bit_log2's unsigned 32-bit type
+  // Device-only optimizer hint: the result lies in [0, digits]
+  _CCCL_CUDA_BUILTIN_ASSUME((is_unsigned_v<_Tp> ? true : __ret >= 0) && __ret <= numeric_limits<_Tp>::digits);
+  return __ret;
+}
+
+// Returns the smallest power of two that is greater than or equal to __t (1 when __t <= 1).
+// Precondition: the result must be representable in _Tp, i.e. __t <= 2^(digits - 1); violations are reported
+// through _CCCL_ASSERT.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept
 {
-  return __t == 0 ? 0 : static_cast<int>(__bit_log2(__t) + 1);
+  // max() / 2 + 1 == 2^(digits - 1) is itself a power of two, so it is the largest valid input
+  _CCCL_ASSERT(__t <= numeric_limits<_Tp>::max() / 2 + 1, "bit_ceil overflow");
+  if (_CUDA_VSTD::is_constant_evaluated() && __t <= 1)
+  {
+    return 1;
+  }
+  // if __t == 0, bit_width() applies to 0xFFFFFFFF and returns 32
+  // In CUDA, unsigned{1} << 32 --> 0
+  // The result is computed as max(1, 1 << bit_width(__t - 1)) because max() requires fewer instructions than the
+  // ternary operator
+  using _Up    = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __width = _CUDA_VSTD::bit_width(static_cast<_Up>(__t - 1)); // __width is int (bit_width's return type)
+  // clang-format off
+  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                   (auto __ret = static_cast<_Tp>(_CUDA_VSTD::max(_Up{1}, _Up{1} << __width));
+                    _CCCL_BUILTIN_ASSUME(__ret >= __t);
+                    return __ret;),
+                   (return static_cast<_Tp>(__t <= 1 ? 1 : _Up{1} << __width);))
+  // clang-format on
 }
 
+// Returns the largest power of two that is less than or equal to __t, or 0 when __t == 0.
+// Only participates in overload resolution for types accepted by __cccl_is_unsigned_integer.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::__cccl_is_unsigned_integer, _Tp))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept
+{
+  // __bit_log2 returns 0xFFFFFFFF if __t == 0
+  // (CUDA) shift returns 0 if the right operand is larger than the number of bits of the type
+  // -> the result is 0 if __t == 0
+  // During constant evaluation the oversized shift is not relied upon, so zero is handled explicitly
+  if (_CUDA_VSTD::is_constant_evaluated() && __t == 0)
+  {
+    return 0;
+  }
+  using _Up   = _CUDA_VSTD::make_unsigned_t<decltype(+_Tp{})>;
+  auto __log2 = _CUDA_VSTD::__bit_log2(static_cast<_Up>(__t));
+  // Device: rely on the shift behavior described above; host: guard the zero case before shifting
+  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                    (return _Tp{1} << __log2;), //
+                    (return __t == 0 ? 0 : _Tp{1} << __log2;))
+}
+
+#undef _CCCL_CUDA_BUILTIN_ASSUME
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___BIT_INTEGRAL_H
diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h
@@ -12,6 +12,8 @@
 
 #include <cuda/std/detail/__config>
 
+#include "cuda/std/__cccl/dialect.h"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -48,6 +50,9 @@ struct __cccl_is_unsigned_integer<__uint128_t> : public true_type
 {};
 #endif
 
+// Variable-template shorthand for __cccl_is_unsigned_integer<_Tp>::value.
+template <typename _Tp>
+_CCCL_INLINE_VAR constexpr auto __cccl_is_unsigned_integer_v = __cccl_is_unsigned_integer<_Tp>::value;
+
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H