From cc4b86eef90e0f863ecc2ca06ef4f56ae77c9174 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 27 Jan 2025 21:06:02 +0100 Subject: [PATCH 1/8] Backport to 2.8: Implement `cuda::std::numeric_limits` for `__half` and `__nv_bfloat16` (#3361) (#3490) With a dedicated C++11 fix Co-authored-by: Michael Schellenberger Costa --- libcudacxx/include/cuda/std/limits | 207 +++++++++++++++- .../std/containers/views/mdspan/my_int.hpp | 15 ++ .../limits/is_specialized.pass.cpp | 7 + .../limits/numeric.limits.members/common.h | 41 ++++ .../const_data_members.pass.cpp | 225 +++++------------- .../denorm_min.pass.cpp | 15 +- .../numeric.limits.members/digits.pass.cpp | 7 +- .../numeric.limits.members/digits10.pass.cpp | 6 + .../numeric.limits.members/epsilon.pass.cpp | 15 +- .../has_denorm.pass.cpp | 6 + .../has_denorm_loss.pass.cpp | 6 + .../has_infinity.pass.cpp | 6 + .../has_quiet_NaN.pass.cpp | 6 + .../has_signaling_NaN.pass.cpp | 6 + .../numeric.limits.members/infinity.pass.cpp | 26 +- .../is_bounded.pass.cpp | 6 + .../numeric.limits.members/is_exact.pass.cpp | 6 + .../numeric.limits.members/is_iec559.pass.cpp | 6 + .../is_integer.pass.cpp | 6 + .../numeric.limits.members/is_modulo.pass.cpp | 6 + .../numeric.limits.members/is_signed.pass.cpp | 6 + .../numeric.limits.members/lowest.pass.cpp | 16 +- .../numeric.limits.members/max.pass.cpp | 15 +- .../max_digits10.pass.cpp | 6 + .../max_exponent.pass.cpp | 6 + .../max_exponent10.pass.cpp | 6 + .../numeric.limits.members/min.pass.cpp | 15 +- .../min_exponent.pass.cpp | 6 + .../min_exponent10.pass.cpp | 6 + .../numeric.limits.members/quiet_NaN.pass.cpp | 10 +- .../numeric.limits.members/radix.pass.cpp | 6 + .../round_error.pass.cpp | 15 +- .../round_style.pass.cpp | 6 + .../signaling_NaN.pass.cpp | 10 +- .../tinyness_before.pass.cpp | 6 + .../numeric.limits.members/traps.pass.cpp | 6 + 36 files changed, 564 insertions(+), 201 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index b5f6d847434..5e4df32270e 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -22,7 +22,10 @@ #endif // no system header #include -#include +#include +#include +#include +#include #include #include @@ -46,7 +49,47 @@ enum float_denorm_style denorm_present = 1 }; -template ::value> +enum class __numeric_limits_type +{ + __integral, + __bool, + __floating_point, + __other, +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr __numeric_limits_type __make_numeric_limits_type() +{ +#if !defined(_CCCL_NO_IF_CONSTEXPR) + _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_same, _Tp, bool)) + { + return __numeric_limits_type::__bool; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_integral, _Tp)) + { + return __numeric_limits_type::__integral; + } + else _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp)) + { + return __numeric_limits_type::__floating_point; + } + else + { + return __numeric_limits_type::__other; + } + _CCCL_UNREACHABLE(); +#else // ^^^ !_CCCL_NO_IF_CONSTEXPR ^^^ // vvv _CCCL_NO_IF_CONSTEXPR vvv + return _CCCL_TRAIT(is_same, _Tp, bool) + ? __numeric_limits_type::__bool + : (_CCCL_TRAIT(is_integral, _Tp) + ? __numeric_limits_type::__integral + : (_CCCL_TRAIT(is_floating_point, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp) + ? 
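+      // Pre-C++17 fallback: the same type classification as the if-constexpr
+      // chain above, folded into nested conditional expressions, since a
+      // C++11 constexpr function body must consist of a single return
+      // statement.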
__numeric_limits_type::__floating_point + : __numeric_limits_type::__other)); +#endif // _CCCL_NO_IF_CONSTEXPR +} + +template ()> class __numeric_limits_impl { public: @@ -135,7 +178,7 @@ struct __int_min<_Tp, __digits, false> }; template -class __numeric_limits_impl<_Tp, true> +class __numeric_limits_impl<_Tp, __numeric_limits_type::__integral> { public: using type = _Tp; @@ -212,7 +255,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = bool; @@ -286,7 +329,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = float; @@ -381,7 +424,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { public: using type = double; @@ -476,7 +519,7 @@ public: }; template <> -class __numeric_limits_impl +class __numeric_limits_impl { #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE @@ -551,6 +594,156 @@ public: #endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE }; +#if defined(_LIBCUDACXX_HAS_NVFP16) +template <> +class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point> +{ +public: + using type = __half; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__half_raw{0x0400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__half_raw{0x7bffu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__half_raw{0xfbffu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__half_raw{0x1400u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__half_raw{0x3800u}); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__half_raw{0x7c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__half_raw{0x7e00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__half_raw{0x7d00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__half_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +template <> +class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point> +{ +public: + using type = __nv_bfloat16; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static 
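+  // bfloat16 layout: 1 sign bit, 8 exponent bits, 7 stored mantissa bits;
+  // the implicit leading bit gives 8 binary digits of precision, from which
+  // digits10 = floor((8 - 1) * log10(2)) = 2 and
+  // max_digits10 = ceil(8 * log10(2) + 1) = 4 follow, by the same
+  // derivation that yields 11/3/5 for __half above.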
constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept + { + return type(__nv_bfloat16_raw{0x0080u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept + { + return type(__nv_bfloat16_raw{0x7f7fu}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept + { + return type(__nv_bfloat16_raw{0xff7fu}); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept + { + return type(__nv_bfloat16_raw{0x3c00u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept + { + return type(__nv_bfloat16_raw{0x3f00u}); + } + + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept + { + return type(__nv_bfloat16_raw{0x7f80u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fc0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept + { + return type(__nv_bfloat16_raw{0x7fa0u}); + } + _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept + { + return type(__nv_bfloat16_raw{0x0001u}); + } + + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + template class numeric_limits : public __numeric_limits_impl<_Tp> {}; diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp index 4f27784cd61..df34fa1d42e 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/my_int.hpp @@ -1,6 +1,9 @@ #ifndef _MY_INT_HPP #define _MY_INT_HPP +#include +#include + #include "test_macros.h" struct my_int_non_convertible; @@ -22,6 +25,10 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + // Wrapper type that's not implicitly convertible struct my_int_non_convertible @@ -43,6 +50,10 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + // Wrapper type that's not nothrow-constructible struct my_int_non_nothrow_constructible @@ -62,4 +73,8 @@ template <> struct cuda::std::is_integral : cuda::std::true_type {}; +template <> +class cuda::std::numeric_limits : public cuda::std::numeric_limits +{}; + #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp index 
2ecd59004bb..7113c0e2772 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp @@ -68,6 +68,13 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 + static_assert(!cuda::std::numeric_limits>::is_specialized, "!cuda::std::numeric_limits >::is_specialized"); diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h new file mode 100644 index 00000000000..15b48836839 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef NUMERIC_LIMITS_MEMBERS_COMMON_H +#define NUMERIC_LIMITS_MEMBERS_COMMON_H + +// Disable all the extended floating point operations and conversions +#define __CUDA_NO_HALF_CONVERSIONS__ 1 +#define __CUDA_NO_HALF_OPERATORS__ 1 +#define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 +#define __CUDA_NO_BFLOAT16_OPERATORS__ 1 + +#include + +template +__host__ __device__ bool float_eq(T x, T y) +{ + return x == y; +} + +#if defined(_LIBCUDACXX_HAS_NVFP16) +__host__ __device__ inline bool float_eq(__half x, __half y) +{ + return __heq(x, y); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +__host__ __device__ inline bool float_eq(__nv_bfloat16 x, __nv_bfloat16 y) +{ + return __heq(x, y); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +#endif // NUMERIC_LIMITS_MEMBERS_COMMON_H diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index 8db1a9f5f0c..769080cff83 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -42,173 +42,80 @@ template __host__ __device__ void test(T) {} -#define TEST_NUMERIC_LIMITS(type) \ - test(cuda::std::numeric_limits::is_specialized); \ - test(cuda::std::numeric_limits::digits); \ - test(cuda::std::numeric_limits::digits10); \ - test(cuda::std::numeric_limits::max_digits10); \ - test(cuda::std::numeric_limits::is_signed); \ - test(cuda::std::numeric_limits::is_integer); \ - test(cuda::std::numeric_limits::is_exact); \ - test(cuda::std::numeric_limits::radix); \ - test(cuda::std::numeric_limits::min_exponent); \ - test(cuda::std::numeric_limits::min_exponent10); \ - test(cuda::std::numeric_limits::max_exponent); \ - test(cuda::std::numeric_limits::max_exponent10); \ - 
test(cuda::std::numeric_limits::has_infinity); \ - test(cuda::std::numeric_limits::has_quiet_NaN); \ - test(cuda::std::numeric_limits::has_signaling_NaN); \ - test(cuda::std::numeric_limits::has_denorm); \ - test(cuda::std::numeric_limits::has_denorm_loss); \ - test(cuda::std::numeric_limits::is_iec559); \ - test(cuda::std::numeric_limits::is_bounded); \ - test(cuda::std::numeric_limits::is_modulo); \ - test(cuda::std::numeric_limits::traps); \ - test(cuda::std::numeric_limits::tinyness_before); \ - test(cuda::std::numeric_limits::round_style); +template +__host__ __device__ void test_type_helper() +{ + test(cuda::std::numeric_limits::is_specialized); + test(cuda::std::numeric_limits::digits); + test(cuda::std::numeric_limits::digits10); + test(cuda::std::numeric_limits::max_digits10); + test(cuda::std::numeric_limits::is_signed); + test(cuda::std::numeric_limits::is_integer); + test(cuda::std::numeric_limits::is_exact); + test(cuda::std::numeric_limits::radix); + test(cuda::std::numeric_limits::min_exponent); + test(cuda::std::numeric_limits::min_exponent10); + test(cuda::std::numeric_limits::max_exponent); + test(cuda::std::numeric_limits::max_exponent10); + test(cuda::std::numeric_limits::has_infinity); + test(cuda::std::numeric_limits::has_quiet_NaN); + test(cuda::std::numeric_limits::has_signaling_NaN); + test(cuda::std::numeric_limits::has_denorm); + test(cuda::std::numeric_limits::has_denorm_loss); + test(cuda::std::numeric_limits::is_iec559); + test(cuda::std::numeric_limits::is_bounded); + test(cuda::std::numeric_limits::is_modulo); + test(cuda::std::numeric_limits::traps); + test(cuda::std::numeric_limits::tinyness_before); + test(cuda::std::numeric_limits::round_style); +} + +template +__host__ __device__ void test_type() +{ + test_type_helper(); + test_type_helper(); + test_type_helper(); + test_type_helper(); +} struct other {}; int main(int, char**) { - // bool - TEST_NUMERIC_LIMITS(bool) - TEST_NUMERIC_LIMITS(const bool) - TEST_NUMERIC_LIMITS(volatile bool) - TEST_NUMERIC_LIMITS(const volatile bool) - - // char - TEST_NUMERIC_LIMITS(char) - TEST_NUMERIC_LIMITS(const char) - TEST_NUMERIC_LIMITS(volatile char) - TEST_NUMERIC_LIMITS(const volatile char) - - // signed char - TEST_NUMERIC_LIMITS(signed char) - TEST_NUMERIC_LIMITS(const signed char) - TEST_NUMERIC_LIMITS(volatile signed char) - TEST_NUMERIC_LIMITS(const volatile signed char) - - // unsigned char - TEST_NUMERIC_LIMITS(unsigned char) - TEST_NUMERIC_LIMITS(const unsigned char) - TEST_NUMERIC_LIMITS(volatile unsigned char) - TEST_NUMERIC_LIMITS(const volatile unsigned char) - - // wchar_t - TEST_NUMERIC_LIMITS(wchar_t) - TEST_NUMERIC_LIMITS(const wchar_t) - TEST_NUMERIC_LIMITS(volatile wchar_t) - TEST_NUMERIC_LIMITS(const volatile wchar_t) - -#if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - // char8_t - TEST_NUMERIC_LIMITS(char8_t) - TEST_NUMERIC_LIMITS(const char8_t) - TEST_NUMERIC_LIMITS(volatile char8_t) - TEST_NUMERIC_LIMITS(const volatile char8_t) -#endif - - // char16_t - TEST_NUMERIC_LIMITS(char16_t) - TEST_NUMERIC_LIMITS(const char16_t) - TEST_NUMERIC_LIMITS(volatile char16_t) - TEST_NUMERIC_LIMITS(const volatile char16_t) - - // char32_t - TEST_NUMERIC_LIMITS(char32_t) - TEST_NUMERIC_LIMITS(const char32_t) - TEST_NUMERIC_LIMITS(volatile char32_t) - TEST_NUMERIC_LIMITS(const volatile char32_t) - - // short - TEST_NUMERIC_LIMITS(short) - TEST_NUMERIC_LIMITS(const short) - TEST_NUMERIC_LIMITS(volatile short) - TEST_NUMERIC_LIMITS(const volatile short) - - // int - TEST_NUMERIC_LIMITS(int) - 
TEST_NUMERIC_LIMITS(const int) - TEST_NUMERIC_LIMITS(volatile int) - TEST_NUMERIC_LIMITS(const volatile int) - - // long - TEST_NUMERIC_LIMITS(long) - TEST_NUMERIC_LIMITS(const long) - TEST_NUMERIC_LIMITS(volatile long) - TEST_NUMERIC_LIMITS(const volatile long) - -#ifndef _LIBCUDACXX_HAS_NO_INT128 - TEST_NUMERIC_LIMITS(__int128_t) - TEST_NUMERIC_LIMITS(const __int128_t) - TEST_NUMERIC_LIMITS(volatile __int128_t) - TEST_NUMERIC_LIMITS(const volatile __int128_t) -#endif - - // long long - TEST_NUMERIC_LIMITS(long long) - TEST_NUMERIC_LIMITS(const long long) - TEST_NUMERIC_LIMITS(volatile long long) - TEST_NUMERIC_LIMITS(const volatile long long) - - // unsigned short - TEST_NUMERIC_LIMITS(unsigned short) - TEST_NUMERIC_LIMITS(const unsigned short) - TEST_NUMERIC_LIMITS(volatile unsigned short) - TEST_NUMERIC_LIMITS(const volatile unsigned short) - - // unsigned int - TEST_NUMERIC_LIMITS(unsigned int) - TEST_NUMERIC_LIMITS(const unsigned int) - TEST_NUMERIC_LIMITS(volatile unsigned int) - TEST_NUMERIC_LIMITS(const volatile unsigned int) - - // unsigned long - TEST_NUMERIC_LIMITS(unsigned long) - TEST_NUMERIC_LIMITS(const unsigned long) - TEST_NUMERIC_LIMITS(volatile unsigned long) - TEST_NUMERIC_LIMITS(const volatile unsigned long) - - // unsigned long long - TEST_NUMERIC_LIMITS(unsigned long long) - TEST_NUMERIC_LIMITS(const unsigned long long) - TEST_NUMERIC_LIMITS(volatile unsigned long long) - TEST_NUMERIC_LIMITS(const volatile unsigned long long) - + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); +#if TEST_STD_VER >= 2020 && defined(__cpp_char8_t) + test_type(); +#endif // TEST_STD_VER >= 2020 && defined(__cpp_char8_t) + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); + test_type(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - TEST_NUMERIC_LIMITS(__uint128_t) - TEST_NUMERIC_LIMITS(const __uint128_t) - TEST_NUMERIC_LIMITS(volatile __uint128_t) - TEST_NUMERIC_LIMITS(const volatile __uint128_t) -#endif - - // float - TEST_NUMERIC_LIMITS(float) - TEST_NUMERIC_LIMITS(const float) - TEST_NUMERIC_LIMITS(volatile float) - TEST_NUMERIC_LIMITS(const volatile float) - - // double - TEST_NUMERIC_LIMITS(double) - TEST_NUMERIC_LIMITS(const double) - TEST_NUMERIC_LIMITS(volatile double) - TEST_NUMERIC_LIMITS(const volatile double) - + test_type<__int128_t>(); +#endif // _LIBCUDACXX_HAS_NO_INT128 + test_type(); + test_type(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - // long double - TEST_NUMERIC_LIMITS(long double) - TEST_NUMERIC_LIMITS(const long double) - TEST_NUMERIC_LIMITS(volatile long double) - TEST_NUMERIC_LIMITS(const volatile long double) -#endif - - // other - TEST_NUMERIC_LIMITS(other) - TEST_NUMERIC_LIMITS(const other) - TEST_NUMERIC_LIMITS(volatile other) - TEST_NUMERIC_LIMITS(const volatile other) + test_type(); +#endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE +#if defined(_LIBCUDACXX_HAS_NVFP16) + test_type<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test_type<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index 730adc30d36..cc64ed14686 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); - assert(cuda::std::numeric_limits::denorm_min() == expected); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); + assert(float_eq(cuda::std::numeric_limits::denorm_min(), expected)); } int main(int, char**) @@ -65,6 +66,12 @@ int main(int, char**) test(LDBL_TRUE_MIN); # endif #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(5.9604644775390625e-08)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); +#endif // _LIBCUDACXX_HAS_NVBF16 #if !defined(__FLT_DENORM_MIN__) && !defined(FLT_TRUE_MIN) # error Test has no expected values for floating point types #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index 63ecf93515f..efce1ccf678 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -55,6 +55,11 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif - +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 11>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 8>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index 3295686ea49..32990ece4b1 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -59,6 +59,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 3>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 2>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index 5bc22e7f5f2..fa42c5e8fe6 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - 
assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); - assert(cuda::std::numeric_limits::epsilon() == expected); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); + assert(float_eq(cuda::std::numeric_limits::epsilon(), expected)); } int main(int, char**) @@ -56,6 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_EPSILON); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(0.0009765625)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(0.0078125)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index e62208d7e3b..5a0a05ab73b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, cuda::std::denorm_present>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, cuda::std::denorm_present>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 3a665fe2c9b..450e51b8111 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index be41dabb02c..646f5e20160 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 2d13db35438..626b4110695 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index d5cf5096bb7..20cd04d107e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 2d1c29f6f31..34527e300c5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "common.h" + // MSVC has issues with producing INF with divisions by zero. #if defined(_MSC_VER) # include @@ -24,10 +26,10 @@ template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); - assert(cuda::std::numeric_limits::infinity() == expected); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); + assert(float_eq(cuda::std::numeric_limits::infinity(), expected)); } int main(int, char**) @@ -62,6 +64,12 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(1. / 0.); # endif +# if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(1.0 / 0.0)); +# endif // _LIBCUDACXX_HAS_NVFP16 +# if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); +# endif // _LIBCUDACXX_HAS_NVBF16 // MSVC has issues with producing INF with divisions by zero. 
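+// The fallback below uses the INFINITY macro instead; either way, the
+// narrowing conversions __double2half / __double2bfloat16 preserve infinity,
+// so both branches compare against the bit patterns the limits
+// specializations return (0x7c00 for __half, 0x7f80 for __nv_bfloat16).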
#else test(INFINITY); @@ -69,11 +77,13 @@ int main(int, char**) # ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(INFINITY); # endif +# if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(INFINITY)); +# endif // _LIBCUDACXX_HAS_NVFP16 +# if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(INFINITY)); +# endif // _LIBCUDACXX_HAS_NVBF16 #endif return 0; } - -#ifndef TEST_COMPILER_NVRTC -float zero = 0; -#endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index 2dd4bd94fbc..9e671c5d905 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index be45efae70c..cfc9a6cab90 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 6221cd6ed59..945347ff4b5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 3d166f31f28..65dd98fdb04 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if 
defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 7b1adabf0c7..6d82269e1c8 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index d7f98766343..eb39869bf24 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, true>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, true>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index 6fec93e4a3d..e3b832dfd9b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::lowest() == expected); + assert(float_eq(cuda::std::numeric_limits::lowest(), expected)); assert(cuda::std::numeric_limits::is_bounded); } @@ -35,6 +36,7 @@ int main(int, char**) { test(false); test(CHAR_MIN); + test(SCHAR_MIN); test(0); #ifndef TEST_COMPILER_NVRTC @@ -64,6 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(-LDBL_MAX); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(-65504.0)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) 
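+  // -3.3895313892515355e+38 is the largest finite bfloat16 magnitude,
+  // (2 - 2^-7) * 2^127 (raw encoding 0x7f7f), negated; as with -65504.0 for
+  // __half above, lowest() == -max() holds because both formats are
+  // sign-magnitude.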
+ test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index 67c94051729..7ba6dabb1d2 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); - assert(cuda::std::numeric_limits::max() == expected); + assert(float_eq(cuda::std::numeric_limits::max(), expected)); assert(cuda::std::numeric_limits::is_bounded); } @@ -64,6 +65,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MAX); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(65504.0)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index cd5892e6c8c..92b3d13ea61 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 5>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 4>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index aeb9189d315..81d5ae07795 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 16>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 128>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index ca0eb2917f6..4c426b37460 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, 4>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, 38>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index 53d196d2a51..c24c3fde869 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -16,18 +16,19 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); - assert(cuda::std::numeric_limits::min() == expected); + assert(float_eq(cuda::std::numeric_limits::min(), expected)); assert(cuda::std::numeric_limits::is_bounded || !cuda::std::numeric_limits::is_signed); } @@ -65,6 +66,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(LDBL_MIN); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(6.103515625e-05)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index b075bcff87d..e3150f8dc8e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, -13>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, -125>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp 
b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index c787cf4caab..cbca8e04171 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, -4>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, -37>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index ce38b3ed60d..74e7f427941 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -38,7 +38,9 @@ __host__ __device__ void test_imp(cuda::std::false_type) template __host__ __device__ inline void test() { - test_imp(cuda::std::is_floating_point()); + constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + + test_imp(cuda::std::integral_constant{}); } int main(int, char**) @@ -72,6 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 5a709b3aefc..9765db6f760 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -55,6 +55,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, FLT_RADIX>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, FLT_RADIX>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index 01d10e80fb9..ba5049fc49f 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -14,15 +14,16 @@ #include #include +#include "common.h" #include "test_macros.h" template __host__ __device__ void test(T expected) { - assert(cuda::std::numeric_limits::round_error() == expected); - assert(cuda::std::numeric_limits::round_error() == expected); - assert(cuda::std::numeric_limits::round_error() == expected); - 
assert(cuda::std::numeric_limits::round_error() == expected); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); + assert(float_eq(cuda::std::numeric_limits::round_error(), expected)); } int main(int, char**) @@ -56,6 +57,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(0.5); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(__double2half(0.5)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(__double2bfloat16(0.5)); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 7a7099662f0..3fb436381a7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, cuda::std::round_to_nearest>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, cuda::std::round_to_nearest>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 164d54c5741..69ba66038de 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -38,7 +38,9 @@ __host__ __device__ void test_imp(cuda::std::false_type) template __host__ __device__ inline void test() { - test_imp(cuda::std::is_floating_point()); + constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + + test_imp(cuda::std::integral_constant{}); } int main(int, char**) @@ -72,6 +74,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 70d832dc547..70cde2711a1 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -54,6 +54,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); 
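+  // As for float and double in this implementation, neither extended type
+  // reports tinyness detected before rounding.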
+#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 5c66acb56ce..7dd7eee68cc 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -60,6 +60,12 @@ int main(int, char**) #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE test(); #endif +#if defined(_LIBCUDACXX_HAS_NVFP16) + test<__half, false>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#if defined(_LIBCUDACXX_HAS_NVBF16) + test<__nv_bfloat16, false>(); +#endif // _LIBCUDACXX_HAS_NVBF16 return 0; } From 450b6a8462d0366afbd97930ba90677160a47e48 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 29 Jan 2025 15:42:37 -0800 Subject: [PATCH 2/8] Backport PRs #3201, #3523, #3547, #3580 to the 2.8.x branch. (#3536) * [FEA]: Introduce Python module with CCCL headers (#3201) * Add cccl/python/cuda_cccl directory and use from cuda_parallel, cuda_cooperative * Run `copy_cccl_headers_to_aude_include()` before `setup()` * Create python/cuda_cccl/cuda/_include/__init__.py, then simply import cuda._include to find the include path. * Add cuda.cccl._version exactly as for cuda.cooperative and cuda.parallel * Bug fix: cuda/_include only exists after shutil.copytree() ran. * Use `f"cuda-cccl @ file://{cccl_path}/python/cuda_cccl"` in setup.py * Remove CustomBuildCommand, CustomWheelBuild in cuda_parallel/setup.py (they are equivalent to the default functions) * Replace := operator (needs Python 3.8+) * Fix oversights: remove `pip3 install ./cuda_cccl` lines from README.md * Restore original README.md: `pip3 install -e` now works on first pass. * cuda_cccl/README.md: FOR INTERNAL USE ONLY * Remove `$pymajor.$pyminor.` prefix in cuda_cccl _version.py (as suggested under https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894035917) Command used: ci/update_version.sh 2 8 0 * Modernize pyproject.toml, setup.py Trigger for this change: * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894043178 * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894044996 * Install CCCL headers under cuda.cccl.include Trigger for this change: * https://github.com/NVIDIA/cccl/pull/3201#discussion_r1894048562 Unexpected accidental discovery: cuda.cooperative unit tests pass without CCCL headers entirely. * Factor out cuda_cccl/cuda/cccl/include_paths.py * Reuse cuda_cccl/cuda/cccl/include_paths.py from cuda_cooperative * Add missing Copyright notice. * Add missing __init__.py (cuda.cccl) * Add `"cuda.cccl"` to `autodoc.mock_imports` * Move cuda.cccl.include_paths into function where it is used. (Attempt to resolve Build and Verify Docs failure.) * Add # TODO: move this to a module-level import * Modernize cuda_cooperative/pyproject.toml, setup.py * Convert cuda_cooperative to use hatchling as build backend. * Revert "Convert cuda_cooperative to use hatchling as build backend." This reverts commit 61637d608da06fcf6851ef6197f88b5e7dbc3bbe. * Move numpy from [build-system] requires -> [project] dependencies * Move pyproject.toml [project] dependencies -> setup.py install_requires, to be able to use CCCL_PATH * Remove copy_license() and use license_files=["../../LICENSE"] instead. 
* Further modernize cuda_cccl/setup.py to use pathlib * Trivial simplifications in cuda_cccl/pyproject.toml * Further simplify cuda_cccl/pyproject.toml, setup.py: remove inconsequential code * Make cuda_cooperative/pyproject.toml more similar to cuda_cccl/pyproject.toml * Add taplo-pre-commit to .pre-commit-config.yaml * taplo-pre-commit auto-fixes * Use pathlib in cuda_cooperative/setup.py * CCCL_PYTHON_PATH in cuda_cooperative/setup.py * Modernize cuda_parallel/pyproject.toml, setup.py * Use pathlib in cuda_parallel/setup.py * Add `# TOML lint & format` comment. * Replace MANIFEST.in with `[tool.setuptools.package-data]` section in pyproject.toml * Use pathlib in cuda/cccl/include_paths.py * pre-commit autoupdate (EXCEPT clang-format, which was manually restored) * Fixes after git merge main * Resolve warning: AttributeError: '_Reduce' object has no attribute 'build_result' ``` =========================================================================== warnings summary =========================================================================== tests/test_reduce.py::test_reduce_non_contiguous /home/coder/cccl/python/devenv/lib/python3.12/site-packages/_pytest/unraisableexception.py:85: PytestUnraisableExceptionWarning: Exception ignored in: Traceback (most recent call last): File "/home/coder/cccl/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py", line 132, in __del__ bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) ^^^^^^^^^^^^^^^^^ AttributeError: '_Reduce' object has no attribute 'build_result' warnings.warn(pytest.PytestUnraisableExceptionWarning(msg)) -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html ============================================================= 1 passed, 93 deselected, 1 warning in 0.44s ============================================================== ``` * Move `copy_cccl_headers_to_cuda_cccl_include()` functionality to `class CustomBuildPy` * Introduce cuda_cooperative/constraints.txt * Also add cuda_parallel/constraints.txt * Add `--constraint constraints.txt` in ci/test_python.sh * Update Copyright dates * Switch to https://github.com/ComPWA/taplo-pre-commit (the other repo has been archived by the owner on Jul 1, 2024) For completeness: The other repo took a long time to install into the pre-commit cache; so long it lead to timeouts in the CCCL CI. * Remove unused cuda_parallel jinja2 dependency (noticed by chance). * Remove constraints.txt files, advertise running `pip install cuda-cccl` first instead. * Make cuda_cooperative, cuda_parallel testing completely independent. * Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc] * Try using another runner (because V100 runners seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Fix sign-compare warning (#3408) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Revert "Try using another runner (because V100 runners seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc]" This reverts commit ea33a218ed77a075156cd1b332047202adb25aa2. 
Error message: https://github.com/NVIDIA/cccl/pull/3201#issuecomment-2594012971 * Try using A100 runner (because V100 runners still seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Also show cuda-cooperative site-packages, cuda-parallel site-packages (after pip install) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Try using l4 runner (because V100 runners still seem to be stuck) [skip-rapids][skip-matx][skip-docs][skip-vdc] * Restore original ci/matrix.yaml [skip-rapids] * Use for loop in test_python.sh to avoid code duplication. * Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci] * Comment out taplo-lint in pre-commit config [skip-rapids][skip-matx][skip-docs][skip-vdc] * Revert "Run only test_python.sh [skip-rapids][skip-matx][skip-docs][skip-vdc][skip pre-commit.ci]" This reverts commit ec206fd8b50a6a293e00a5825b579e125010b13d. * Implement suggestion by @shwina (https://github.com/NVIDIA/cccl/pull/3201#pullrequestreview-2556918460) * Address feedback by @leofang --------- Co-authored-by: Bernhard Manfred Gruber * cuda.parallel: invoke pytest directly rather than via `python -m pytest` (#3523) Co-authored-by: Ashwin Srinath * Copy file from PR #3547 (bugfix/drop_pipe_in_lit by @wmaxey) * Revert "cuda.parallel: invoke pytest directly rather than via `python -m pytest` (#3523)" This reverts commit a2e21cbdd2fa15a35b3a0df8eb7e2fc84adc46bc. * Replace pipes.quote with shlex.quote in lit config (#3547) * Replace pipes.quote with shlex.quote * Drop TBB run on windows to unblock CI * Update ci/matrix.yaml Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber * Remove nvks runners from testing pool. (#3580) --------- Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Co-authored-by: Ashwin Srinath Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Co-authored-by: Michael Schellenberger Costa Co-authored-by: Allison Piper --- .pre-commit-config.yaml | 11 +++ ci/matrix.yaml | 19 ++-- ci/test_python.sh | 33 ++++--- ci/update_version.sh | 2 + docs/repo.toml | 1 + .../test/utils/libcudacxx/test/config.py | 15 +-- python/cuda_cccl/.gitignore | 2 + python/cuda_cccl/README.md | 3 + python/cuda_cccl/cuda/cccl/__init__.py | 8 ++ python/cuda_cccl/cuda/cccl/_version.py | 7 ++ python/cuda_cccl/cuda/cccl/include_paths.py | 63 ++++++++++++ python/cuda_cccl/pyproject.toml | 29 ++++++ python/cuda_cccl/setup.py | 51 ++++++++++ python/cuda_cooperative/.gitignore | 1 - python/cuda_cooperative/MANIFEST.in | 1 - python/cuda_cooperative/README.md | 1 + .../cuda/cooperative/experimental/_nvrtc.py | 46 ++------- python/cuda_cooperative/pyproject.toml | 34 ++++++- python/cuda_cooperative/setup.py | 88 +---------------- python/cuda_parallel/.gitignore | 1 - python/cuda_parallel/MANIFEST.in | 1 - python/cuda_parallel/README.md | 1 + .../cuda/parallel/experimental/_bindings.py | 42 +------- .../experimental/algorithms/reduce.py | 5 + python/cuda_parallel/pyproject.toml | 32 +++++-- python/cuda_parallel/setup.py | 96 +++---------------- 26 files changed, 303 insertions(+), 290 deletions(-) create mode 100644 python/cuda_cccl/.gitignore create mode 100644 python/cuda_cccl/README.md create mode 100644 python/cuda_cccl/cuda/cccl/__init__.py create mode 100644 python/cuda_cccl/cuda/cccl/_version.py create mode 100644 python/cuda_cccl/cuda/cccl/include_paths.py create mode 100644 python/cuda_cccl/pyproject.toml create mode 100644 python/cuda_cccl/setup.py 
delete mode 100644 python/cuda_cooperative/MANIFEST.in delete mode 100644 python/cuda_parallel/MANIFEST.in diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dd411ba39b..37ead30ca95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,17 @@ repos: hooks: - id: ruff # linter - id: ruff-format # formatter + + # TOML lint & format + - repo: https://github.com/ComPWA/taplo-pre-commit + rev: v0.9.3 + hooks: + # See https://github.com/NVIDIA/cccl/issues/3426 + # - id: taplo-lint + # exclude: "^docs/" + - id: taplo-format + exclude: "^docs/" + - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: diff --git a/ci/matrix.yaml b/ci/matrix.yaml index fd7f2f079c4..b04923fad9a 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,8 +19,9 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['intel', 'msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']} + - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + # Disabled until we figure out the issue with the TBB dll + #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} # Split up cub tests: - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} @@ -278,13 +279,13 @@ projects: # testing -> Runner with GPU is in a nv-gh-runners testing pool gpus: - v100: { sm: 70 } # 32 GB, 40 runners - t4: { sm: 75, testing: true } # 16 GB, 8 runners - rtx2080: { sm: 75, testing: true } # 8 GB, 8 runners - rtxa6000: { sm: 86, testing: true } # 48 GB, 12 runners - l4: { sm: 89, testing: true } # 24 GB, 48 runners - rtx4090: { sm: 89, testing: true } # 24 GB, 10 runners - h100: { sm: 90, testing: true } # 80 GB, 16 runners + v100: { sm: 70 } # 32 GB, 40 runners + t4: { sm: 75 } # 16 GB, 10 runners + rtx2080: { sm: 75 } # 8 GB, 12 runners + rtxa6000: { sm: 86 } # 48 GB, 12 runners + l4: { sm: 89 } # 24 GB, 48 runners + rtx4090: { sm: 89 } # 24 GB, 10 runners + h100: { sm: 90 } # 80 GB, 16 runners # Tags are used to define a `matrix job` in the workflow section. # diff --git a/ci/test_python.sh b/ci/test_python.sh index bd66cc57716..34900fdb8e0 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,25 +8,28 @@ print_environment_details fail_if_no_gpu -readonly prefix="${BUILD_DIR}/python/" -export PYTHONPATH="${prefix}:${PYTHONPATH:-}" +begin_group "⚙️ Existing site-packages" +pip freeze +end_group "⚙️ Existing site-packages" -pushd ../python/cuda_cooperative >/dev/null +for module in cuda_parallel cuda_cooperative; do -run_command "⚙️ Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_cooperative" python -m pytest -v ./tests + pushd "../python/${module}" >/dev/null -popd >/dev/null + TEMP_VENV_DIR="/tmp/${module}_venv" + rm -rf "${TEMP_VENV_DIR}" + python -m venv "${TEMP_VENV_DIR}" + . 
"${TEMP_VENV_DIR}/bin/activate" + echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt + run_command "⚙️ Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test] + begin_group "⚙️ ${module} site-packages" + pip freeze + end_group "⚙️ ${module} site-packages" + run_command "🚀 Pytest ${module}" python -m pytest -v ./tests + deactivate -pushd ../python/cuda_parallel >/dev/null + popd >/dev/null -# Temporarily install the package twice to populate include directory as part of the first installation -# and to let manifest discover these includes during the second installation. Do not forget to remove the -# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed. -run_command "⚙️ Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "⚙️ Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test] -run_command "🚀 Pytest cuda_parallel" python -m pytest -v ./tests - -popd >/dev/null +done print_time_summary diff --git a/ci/update_version.sh b/ci/update_version.sh index c43303449bb..6a25a837d50 100755 --- a/ci/update_version.sh +++ b/ci/update_version.sh @@ -37,6 +37,7 @@ CUB_CMAKE_VERSION_FILE="lib/cmake/cub/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="lib/cmake/thrust/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="lib/cmake/cudax/cudax-config-version.cmake" +CUDA_CCCL_VERSION_FILE="python/cuda_cccl/cuda/cccl/_version.py" CUDA_COOPERATIVE_VERSION_FILE="python/cuda_cooperative/cuda/cooperative/_version.py" CUDA_PARALLEL_VERSION_FILE="python/cuda_parallel/cuda/parallel/_version.py" @@ -110,6 +111,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" " update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" +update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\"" update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\"" diff --git a/docs/repo.toml b/docs/repo.toml index f5be6925fb5..3313723c527 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -348,6 +348,7 @@ autodoc.mock_imports = [ "numba", "pynvjitlink", "cuda.bindings", + "cuda.cccl", "llvmlite", "numpy", ] diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index d52b869dd17..af90b9fcbec 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -8,7 +8,6 @@ import ctypes import os -import pipes import platform import re import shlex @@ -423,10 +422,6 @@ def configure_cxx(self): ) ) - if "icc" in self.config.available_features: - self.cxx.link_flags += ["-lirc"] - self.cxx.compile_flags += ["-Xcompiler=-diag-disable=10441"] - def _configure_clang_cl(self, clang_path): def _split_env_var(var): return [p.strip() for p in os.environ.get(var, "").split(";") if p.strip()] @@ -1516,14 +1511,14 @@ def configure_modules(self): def configure_substitutions(self): sub = 
self.config.substitutions - cxx_path = pipes.quote(self.cxx.path) + cxx_path = shlex.quote(self.cxx.path) # Configure compiler substitutions sub.append(("%cxx", cxx_path)) sub.append(("%libcxx_src_root", self.libcudacxx_src_root)) # Configure flags substitutions - flags_str = " ".join([pipes.quote(f) for f in self.cxx.flags]) - compile_flags_str = " ".join([pipes.quote(f) for f in self.cxx.compile_flags]) - link_flags_str = " ".join([pipes.quote(f) for f in self.cxx.link_flags]) + flags_str = " ".join([shlex.quote(f) for f in self.cxx.flags]) + compile_flags_str = " ".join([shlex.quote(f) for f in self.cxx.compile_flags]) + link_flags_str = " ".join([shlex.quote(f) for f in self.cxx.link_flags]) all_flags = "%s %s %s" % (flags_str, compile_flags_str, link_flags_str) sub.append(("%flags", flags_str)) sub.append(("%compile_flags", compile_flags_str)) @@ -1552,7 +1547,7 @@ def configure_substitutions(self): sub.append(("%run", "%t.exe")) # Configure not program substitutions not_py = os.path.join(self.libcudacxx_src_root, "test", "utils", "not.py") - not_str = "%s %s " % (pipes.quote(sys.executable), pipes.quote(not_py)) + not_str = "%s %s " % (shlex.quote(sys.executable), shlex.quote(not_py)) sub.append(("not ", not_str)) if self.get_lit_conf("libcudacxx_gdb"): sub.append(("%libcxx_gdb", self.get_lit_conf("libcudacxx_gdb"))) diff --git a/python/cuda_cccl/.gitignore b/python/cuda_cccl/.gitignore new file mode 100644 index 00000000000..24ec757199f --- /dev/null +++ b/python/cuda_cccl/.gitignore @@ -0,0 +1,2 @@ +cuda/cccl/include +*egg-info diff --git a/python/cuda_cccl/README.md b/python/cuda_cccl/README.md new file mode 100644 index 00000000000..37f020b6df6 --- /dev/null +++ b/python/cuda_cccl/README.md @@ -0,0 +1,3 @@ +## Note + +This package is currently FOR INTERNAL USE ONLY and not meant to be used/installed explicitly. diff --git a/python/cuda_cccl/cuda/cccl/__init__.py b/python/cuda_cccl/cuda/cccl/__init__.py new file mode 100644 index 00000000000..5288f071942 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from cuda.cccl._version import __version__ +from cuda.cccl.include_paths import get_include_paths + +__all__ = ["__version__", "get_include_paths"] diff --git a/python/cuda_cccl/cuda/cccl/_version.py b/python/cuda_cccl/cuda/cccl/_version.py new file mode 100644 index 00000000000..ec7c29a266e --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/_version.py @@ -0,0 +1,7 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This file is generated by ci/update_version.sh +# Do not edit this file manually. +__version__ = "2.8.0" diff --git a/python/cuda_cccl/cuda/cccl/include_paths.py b/python/cuda_cccl/cuda/cccl/include_paths.py new file mode 100644 index 00000000000..da8246b9195 --- /dev/null +++ b/python/cuda_cccl/cuda/cccl/include_paths.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import shutil +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Optional + + +def _get_cuda_path() -> Optional[Path]: + cuda_path = os.environ.get("CUDA_PATH") + if cuda_path: + cuda_path = Path(cuda_path) + if cuda_path.exists(): + return cuda_path + + nvcc_path = shutil.which("nvcc") + if nvcc_path: + return Path(nvcc_path).parent.parent + + default_path = Path("/usr/local/cuda") + if default_path.exists(): + return default_path + + return None + + +@dataclass +class IncludePaths: + cuda: Optional[Path] + libcudacxx: Optional[Path] + cub: Optional[Path] + thrust: Optional[Path] + + def as_tuple(self): + # Note: higher-level ... lower-level order: + return (self.thrust, self.cub, self.libcudacxx, self.cuda) + + +@lru_cache() +def get_include_paths() -> IncludePaths: + # TODO: once docs env supports Python >= 3.9, we + # can move this to a module-level import. + from importlib.resources import as_file, files + + cuda_incl = None + cuda_path = _get_cuda_path() + if cuda_path is not None: + cuda_incl = cuda_path / "include" + + with as_file(files("cuda.cccl.include")) as f: + cccl_incl = Path(f) + assert cccl_incl.exists() + + return IncludePaths( + cuda=cuda_incl, + libcudacxx=cccl_incl / "libcudacxx", + cub=cccl_incl, + thrust=cccl_incl, + ) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml new file mode 100644 index 00000000000..ada06301a4c --- /dev/null +++ b/python/cuda_cccl/pyproject.toml @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +[build-system] +requires = ["setuptools>=61.0.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "cuda-cccl" +description = "Experimental Package with CCCL headers to support JIT compilation" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dynamic = ["version", "readme"] + +[project.urls] +Homepage = "https://github.com/NVIDIA/cccl" + +[tool.setuptools.dynamic] +version = { attr = "cuda.cccl._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + +[tool.setuptools.package-data] +cuda = ["cccl/include/**/*"] diff --git a/python/cuda_cccl/setup.py b/python/cuda_cccl/setup.py new file mode 100644 index 00000000000..f6e5e3fa033 --- /dev/null +++ b/python/cuda_cccl/setup.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import shutil +from pathlib import Path + +from setuptools import setup +from setuptools.command.build_py import build_py + +PROJECT_PATH = Path(__file__).resolve().parent +CCCL_PATH = PROJECT_PATH.parents[1] + + +class CustomBuildPy(build_py): + """Copy CCCL headers BEFORE super().run() + + Note that the CCCL headers cannot be referenced directly: + setuptools (and pyproject.toml) does not support relative paths that + reference files outside the package directory (like ../../). + This is a restriction designed to avoid inadvertently packaging files + that are outside the source tree. 
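+ + The headers are therefore copied into cuda/cccl/include inside the package tree (see run() below), so that setuptools can pick them up via the [tool.setuptools.package-data] entry in pyproject.toml.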
+ """ + + def run(self): + cccl_headers = [ + ("cub", "cub"), + ("libcudacxx", "include"), + ("thrust", "thrust"), + ] + + inc_path = PROJECT_PATH / "cuda" / "cccl" / "include" + inc_path.mkdir(parents=True, exist_ok=True) + + for proj_dir, header_dir in cccl_headers: + src_path = CCCL_PATH / proj_dir / header_dir + dst_path = inc_path / proj_dir + if dst_path.exists(): + shutil.rmtree(dst_path) + shutil.copytree(src_path, dst_path) + + init_py_path = inc_path / "__init__.py" + init_py_path.write_text("# Intentionally empty.\n") + + super().run() + + +setup( + license_files=["../../LICENSE"], + cmdclass={"build_py": CustomBuildPy}, +) diff --git a/python/cuda_cooperative/.gitignore b/python/cuda_cooperative/.gitignore index 15c09b246c1..a9904c10554 100644 --- a/python/cuda_cooperative/.gitignore +++ b/python/cuda_cooperative/.gitignore @@ -1,3 +1,2 @@ -cuda/_include env *egg-info diff --git a/python/cuda_cooperative/MANIFEST.in b/python/cuda_cooperative/MANIFEST.in deleted file mode 100644 index 848cbfe2e81..00000000000 --- a/python/cuda_cooperative/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include cuda/_include * diff --git a/python/cuda_cooperative/README.md b/python/cuda_cooperative/README.md index c202d1d6c17..673e130bbe0 100644 --- a/python/cuda_cooperative/README.md +++ b/python/cuda_cooperative/README.md @@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html. ## Local development ```bash +pip3 install -e ../cuda_cccl pip3 install -e .[test] pytest -v ./tests/ ``` diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py index 1e86dd45dfe..a1d269fd987 100644 --- a/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py +++ b/python/cuda_cooperative/cuda/cooperative/experimental/_nvrtc.py @@ -3,9 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import functools -import importlib.resources as pkg_resources -import os -import shutil from cuda.bindings import nvrtc from cuda.cooperative.experimental._caching import disk_cache @@ -20,22 +17,6 @@ def CHECK_NVRTC(err, prog): raise RuntimeError(f"NVRTC error: {log.decode('ascii')}") -def get_cuda_path(): - cuda_path = os.environ.get("CUDA_PATH", "") - if os.path.exists(cuda_path): - return cuda_path - - nvcc_path = shutil.which("nvcc") - if nvcc_path is not None: - return os.path.dirname(os.path.dirname(nvcc_path)) - - default_path = "/usr/local/cuda" - if os.path.exists(default_path): - return default_path - - return None - - # cpp is the C++ source code # cc = 800 for Ampere, 900 Hopper, etc # rdc is true or false @@ -47,24 +28,15 @@ def compile_impl(cpp, cc, rdc, code, nvrtc_path, nvrtc_version): check_in("rdc", rdc, [True, False]) check_in("code", code, ["lto", "ptx"]) - with pkg_resources.path("cuda", "_include") as include_path: - # Using `.parent` for compatibility with pip install --editable: - include_path = pkg_resources.files("cuda.cooperative").parent.joinpath( - "_include" - ) - cub_path = include_path - thrust_path = include_path - libcudacxx_path = os.path.join(include_path, "libcudacxx") - cuda_include_path = os.path.join(get_cuda_path(), "include") - - opts = [ - b"--std=c++17", - bytes(f"--include-path={cub_path}", encoding="ascii"), - bytes(f"--include-path={thrust_path}", encoding="ascii"), - bytes(f"--include-path={libcudacxx_path}", encoding="ascii"), - bytes(f"--include-path={cuda_include_path}", encoding="ascii"), - bytes(f"--gpu-architecture=compute_{cc}", 
encoding="ascii"), - ] + opts = [b"--std=c++17"] + + # TODO: move this to a module-level import (after docs env modernization). + from cuda.cccl import get_include_paths + + for path in get_include_paths().as_tuple(): + if path is not None: + opts += [f"--include-path={path}".encode("ascii")] + opts += [f"--gpu-architecture=compute_{cc}".encode("ascii")] if rdc: opts += [b"--relocatable-device-code=true"] diff --git a/python/cuda_cooperative/pyproject.toml b/python/cuda_cooperative/pyproject.toml index 017c0be1e56..788e1e6d5d8 100644 --- a/python/cuda_cooperative/pyproject.toml +++ b/python/cuda_cooperative/pyproject.toml @@ -1,11 +1,41 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception [build-system] -requires = ["packaging", "setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=61.0.0"] build-backend = "setuptools.build_meta" +[project] +name = "cuda-cooperative" +description = "Experimental Core Library for CUDA Python" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dependencies = [ + "cuda-cccl", + "numpy", + "numba>=0.60.0", + "pynvjitlink-cu12>=0.2.4", + "cuda-python==12.*", + "jinja2", +] +dynamic = ["version", "readme"] + +[project.optional-dependencies] +test = ["pytest", "pytest-xdist"] + +[project.urls] +Homepage = "https://developer.nvidia.com/" + +[tool.setuptools.dynamic] +version = { attr = "cuda.cooperative._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + [tool.ruff] extend = "../../pyproject.toml" diff --git a/python/cuda_cooperative/setup.py b/python/cuda_cooperative/setup.py index 5f954086cfe..b8dd6502515 100644 --- a/python/cuda_cooperative/setup.py +++ b/python/cuda_cooperative/setup.py @@ -1,91 +1,9 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os -import shutil - -from setuptools import Command, find_namespace_packages, setup -from setuptools.command.build_py import build_py -from wheel.bdist_wheel import bdist_wheel - -project_path = os.path.abspath(os.path.dirname(__file__)) -cccl_path = os.path.abspath(os.path.join(project_path, "..", "..")) -cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]] -__version__ = None -with open(os.path.join(project_path, "cuda", "cooperative", "_version.py")) as f: - exec(f.read()) -assert __version__ is not None -ver = __version__ -del __version__ - - -with open("README.md") as f: - long_description = f.read() - - -class CustomBuildCommand(build_py): - def run(self): - self.run_command("package_cccl") - build_py.run(self) - - -class CustomWheelBuild(bdist_wheel): - def run(self): - self.run_command("package_cccl") - super().run() - - -class PackageCCCLCommand(Command): - description = "Generate additional files" - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - for proj_dir, header_dir in cccl_headers: - src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir)) - dst_path = os.path.join(project_path, "cuda", "_include", proj_dir) - if os.path.exists(dst_path): - shutil.rmtree(dst_path) - shutil.copytree(src_path, dst_path) - +from setuptools import setup setup( - name="cuda-cooperative", - version=ver, - description="Experimental Core Library for CUDA Python", - long_description=long_description, - long_description_content_type="text/markdown", - author="NVIDIA Corporation", - classifiers=[ - "Programming Language :: Python :: 3 :: Only", - "Environment :: GPU :: NVIDIA CUDA", - ], - packages=find_namespace_packages(include=["cuda.*"]), - python_requires=">=3.9", - install_requires=[ - "numba>=0.60.0", - "pynvjitlink-cu12>=0.2.4", - "cuda-python", - "jinja2", - ], - extras_require={ - "test": [ - "pytest", - "pytest-xdist", - ] - }, - cmdclass={ - "package_cccl": PackageCCCLCommand, - "build_py": CustomBuildCommand, - "bdist_wheel": CustomWheelBuild, - }, - include_package_data=True, - license="Apache-2.0 with LLVM exception", - license_files=("../../LICENSE",), + license_files=["../../LICENSE"], ) diff --git a/python/cuda_parallel/.gitignore b/python/cuda_parallel/.gitignore index 8e0d030ff6a..7fc9da1604e 100644 --- a/python/cuda_parallel/.gitignore +++ b/python/cuda_parallel/.gitignore @@ -1,4 +1,3 @@ -cuda/_include env *egg-info *so diff --git a/python/cuda_parallel/MANIFEST.in b/python/cuda_parallel/MANIFEST.in deleted file mode 100644 index 848cbfe2e81..00000000000 --- a/python/cuda_parallel/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include cuda/_include * diff --git a/python/cuda_parallel/README.md b/python/cuda_parallel/README.md index 98a3a3c92d0..1dad4b0f03e 100644 --- a/python/cuda_parallel/README.md +++ b/python/cuda_parallel/README.md @@ -7,6 +7,7 @@ Please visit the documentation here: https://nvidia.github.io/cccl/python.html. 
## Local development ```bash +pip3 install -e ../cuda_cccl pip3 install -e .[test] pytest -v ./tests/ ``` diff --git a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py index c19ceebbf3e..ffc35ee2a87 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_bindings.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_bindings.py @@ -4,28 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import ctypes -import os -import shutil from functools import lru_cache -from typing import List, Optional +from typing import List -from . import _cccl as cccl - - -def _get_cuda_path() -> Optional[str]: - cuda_path = os.environ.get("CUDA_PATH", "") - if os.path.exists(cuda_path): - return cuda_path - - nvcc_path = shutil.which("nvcc") - if nvcc_path is not None: - return os.path.dirname(os.path.dirname(nvcc_path)) - - default_path = "/usr/local/cuda" - if os.path.exists(default_path): - return default_path +from cuda.cccl import get_include_paths # type: ignore[import-not-found] - return None +from . import _cccl as cccl @lru_cache() @@ -55,27 +39,9 @@ def get_bindings() -> ctypes.CDLL: @lru_cache() def get_paths() -> List[bytes]: - # TODO: once docs env supports Python >= 3.9, we - # can move this to a module-level import. - from importlib.resources import as_file, files - - with as_file(files("cuda.parallel")) as f: - # Using `.parent` for compatibility with pip install --editable: - cub_include_path = str(f.parent / "_include") - thrust_include_path = cub_include_path - libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx")) - cuda_include_path = None - cuda_path = _get_cuda_path() - if cuda_path is not None: - cuda_include_path = str(os.path.join(cuda_path, "include")) paths = [ f"-I{path}".encode() - for path in ( - cub_include_path, - thrust_include_path, - libcudacxx_include_path, - cuda_include_path, - ) + for path in get_include_paths().as_tuple() if path is not None ] return paths diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index b99b5c4c9e1..41c0a3449e2 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -55,6 +55,9 @@ def __init__( op: Callable, h_init: np.ndarray, ): + # Referenced from __del__: + self.build_result = None + d_in_cccl = cccl.to_cccl_iter(d_in) self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name( d_in_cccl.value_type.type.value @@ -128,6 +131,8 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray return temp_storage_bytes.value def __del__(self): + if self.build_result is None: + return bindings = get_bindings() bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) diff --git a/python/cuda_parallel/pyproject.toml b/python/cuda_parallel/pyproject.toml index c73736e496a..e7d2b9f0081 100644 --- a/python/cuda_parallel/pyproject.toml +++ b/python/cuda_parallel/pyproject.toml @@ -1,19 +1,39 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception [build-system] -requires = ["packaging", "setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=61.0.0"] build-backend = "setuptools.build_meta" +[project] +name = "cuda-parallel" +description = "Experimental Core Library for CUDA Python" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dependencies = ["cuda-cccl", "numba>=0.60.0", "cuda-python==12.*"] +dynamic = ["version", "readme"] + +[project.optional-dependencies] +test = ["pytest", "pytest-xdist", "cupy-cuda12x", "typing_extensions"] + +[project.urls] +Homepage = "https://developer.nvidia.com/" + +[tool.setuptools.dynamic] +version = { attr = "cuda.parallel._version.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + [tool.mypy] python_version = "3.10" [[tool.mypy.overrides]] -module = [ - "numba.*", - "llvmlite" -] +module = ["numba.*", "llvmlite"] ignore_missing_imports = true follow_imports = "skip" diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py index bb7cbb3ac44..c5c9fcd3c32 100644 --- a/python/cuda_parallel/setup.py +++ b/python/cuda_parallel/setup.py @@ -1,61 +1,15 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os -import shutil import subprocess +from pathlib import Path -from setuptools import Command, Extension, find_namespace_packages, setup +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from setuptools.command.build_py import build_py -from wheel.bdist_wheel import bdist_wheel -project_path = os.path.abspath(os.path.dirname(__file__)) -cccl_path = os.path.abspath(os.path.join(project_path, "..", "..")) -cccl_headers = [["cub", "cub"], ["libcudacxx", "include"], ["thrust", "thrust"]] -__version__ = None -with open(os.path.join(project_path, "cuda", "parallel", "_version.py")) as f: - exec(f.read()) -assert __version__ is not None -ver = __version__ -del __version__ - - -with open("README.md") as f: - long_description = f.read() - - -class CustomBuildCommand(build_py): - def run(self): - self.run_command("package_cccl") - build_py.run(self) - - -class CustomWheelBuild(bdist_wheel): - def run(self): - self.run_command("package_cccl") - super().run() - - -class PackageCCCLCommand(Command): - description = "Generate additional files" - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - for proj_dir, header_dir in cccl_headers: - src_path = os.path.abspath(os.path.join(cccl_path, proj_dir, header_dir)) - # TODO Extract cccl headers into a standalone package - dst_path = os.path.join(project_path, "cuda", "_include", proj_dir) - if os.path.exists(dst_path): - shutil.rmtree(dst_path) - shutil.copytree(src_path, dst_path) +CCCL_PYTHON_PATH = Path(__file__).resolve().parents[1] +CCCL_PATH = CCCL_PYTHON_PATH.parent class CMakeExtension(Extension): @@ -69,53 +23,27 @@ def run(self): self.build_extension(ext) def build_extension(self, ext): - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + extdir = Path(self.get_ext_fullpath(ext.name)).resolve().parent cmake_args = [ "-DCCCL_ENABLE_C=YES", - 
"-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY=" + extdir, + f"-DCCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY={extdir}", "-DCMAKE_BUILD_TYPE=Release", ] - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) + build_temp_path = Path(self.build_temp) + build_temp_path.mkdir(parents=True, exist_ok=True) - subprocess.check_call(["cmake", cccl_path] + cmake_args, cwd=self.build_temp) + subprocess.check_call(["cmake", CCCL_PATH] + cmake_args, cwd=build_temp_path) subprocess.check_call( ["cmake", "--build", ".", "--target", "cccl.c.parallel"], - cwd=self.build_temp, + cwd=build_temp_path, ) setup( - name="cuda-parallel", - version=ver, - description="Experimental Core Library for CUDA Python", - long_description=long_description, - long_description_content_type="text/markdown", - author="NVIDIA Corporation", - classifiers=[ - "Programming Language :: Python :: 3 :: Only", - "Environment :: GPU :: NVIDIA CUDA", - ], - packages=find_namespace_packages(include=["cuda.*"]), - python_requires=">=3.9", - # TODO: typing_extensions required for Python 3.7 docs env - install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"], - extras_require={ - "test": [ - "pytest", - "pytest-xdist", - "cupy-cuda12x", - ] - }, + license_files=["../../LICENSE"], cmdclass={ - "package_cccl": PackageCCCLCommand, - "build_py": CustomBuildCommand, - "bdist_wheel": CustomWheelBuild, "build_ext": BuildCMakeExtension, }, ext_modules=[CMakeExtension("cuda.parallel.experimental.cccl.c")], - include_package_data=True, - license="Apache-2.0 with LLVM exception", - license_files=("../../LICENSE",), ) From 2b5ed0b2fb10958f820b3c397b4dae008eb292ac Mon Sep 17 00:00:00 2001 From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Date: Thu, 30 Jan 2025 03:17:12 -0800 Subject: [PATCH 3/8] work around msvc bug exposed by `__type_index` in `type_list.h` (#3487) (#3537) simplify the fall-back implementation of `__type_index` to avoid causing MSVC to ICE. Co-authored-by: Eric Niebler Co-authored-by: Michael Schellenberger Costa --- .../cuda/std/__type_traits/type_list.h | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 66652922ceb..461b9fc8fbe 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -473,29 +473,33 @@ using __type_index = __type_call<__detail::__type_index_fn<_Ip::value>, _Ts...>; namespace __detail { -template -struct __inherit_flat : _Ts... -{}; - template -struct __type_index_leaf +struct __type_tuple_elem { - using type = _Ty; + using type _CCCL_NODEBUG_ALIAS = _Ty; }; +template +struct __type_tupl; + +template +struct __type_tupl, _Ts...> : __type_tuple_elem<_Is, _Ts>... 
+{}; + +template +using __type_tuple = __type_tupl, _Ts...>; + template -_LIBCUDACXX_HIDE_FROM_ABI __type_index_leaf<_Ip, _Ty> __type_index_get(__type_index_leaf<_Ip, _Ty>*); +_LIBCUDACXX_HIDE_FROM_ABI __type_tuple_elem<_Ip, _Ty> __type_tuple_get(__type_tuple_elem<_Ip, _Ty>); -template -struct __type_index_large_size_fn; +template +using __type_tuple_element_t _CCCL_NODEBUG_ALIAS = + __type(__type_tuple<_Ts...>{}))>; -template -struct __type_index_large_size_fn> +struct __type_index_large_size_fn { template - using __call _CCCL_NODEBUG_ALIAS = // - __type( - static_cast<__inherit_flat<__type_index_leaf<_Is, _Ts>...>*>(nullptr)))>; + using __call _CCCL_NODEBUG_ALIAS = __type_tuple_element_t<_Ip::value, _Ts...>; }; template @@ -516,12 +520,8 @@ _CCCL_PP_REPEAT_REVERSE(_CCCL_META_UNROLL_LIMIT, _M1) # undef _M1 template -struct __type_index_select_fn // Default for larger indices -{ - template - using __call _CCCL_NODEBUG_ALIAS = - __type_call<__type_index_large_size_fn>, _Ip, _Ts...>; -}; +struct __type_index_select_fn : __type_index_large_size_fn // Default for larger indices +{}; template <> struct __type_index_select_fn // Fast implementation for smaller indices @@ -531,9 +531,19 @@ struct __type_index_select_fn // Fast implementation for smaller indices }; } // namespace __detail +# if !_CCCL_COMPILER(MSVC) + template using __type_index = __type_call<__detail::__type_index_select_fn<(_Ip::value < _CCCL_META_UNROLL_LIMIT)>, _Ip, _Ts...>; +# else // ^^^ !_CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(MSVC) vvv + +// Simplify the implementation for MSVC, which has trouble with the above +template +using __type_index = __detail::__type_index_large_size_fn::__call<_Ip, _Ts...>; + +# endif // !_CCCL_COMPILER(MSVC) + template using __type_index_c = __type_index, _Ts...>; From 863b25f5e2a37db7ee50aa31c7d15e95d7ef2228 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 30 Jan 2025 14:44:50 +0100 Subject: [PATCH 4/8] Update CI matrix to use NVKS nodes. (#3572) (#3605) * Update CI matrix to use NVKS nodes. * Update windows CI scripts to accept -arch. * Move all non-Catch2 device algo tests to lid0/lid1. This makes sure that they run in the correct CI config on appropriate hardware. 
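* For illustration, the renaming rule behind the previous bullet, restated as a self-contained Python sketch (the helper name is hypothetical; the authoritative logic is the `cub/test/CMakeLists.txt` hunk below):

```python
import re

def ci_test_name(test_name: str, launcher: int, is_fail_test: bool) -> str:
    # Non-Catch2 "device_*" tests gain a ".lid_<launcher>" suffix so that
    # they are scheduled into the matching lid0/lid1 CI configurations.
    if re.match(r"^device_", test_name) and not is_fail_test:
        return f"{test_name}.lid_{launcher}"
    return test_name

assert ci_test_name("device_reduce", 0, False) == "device_reduce.lid_0"
assert ci_test_name("block_scan", 0, False) == "block_scan"
```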
* Switch to all rtx queues: CUB -> RTXA6000 (48GiB) Thrust -> RTX4090 (24GiB) Others -> RTX2080 (8GiB) Co-authored-by: Allison Piper --- ci/matrix.yaml | 63 ++++++++++++++------------------- ci/windows/build_common.psm1 | 15 ++++++-- ci/windows/build_cub.ps1 | 8 +++-- ci/windows/build_cudax.ps1 | 8 +++-- ci/windows/build_libcudacxx.ps1 | 8 +++-- ci/windows/build_thrust.ps1 | 8 +++-- ci/windows/test_thrust.ps1 | 8 +++-- cub/test/CMakeLists.txt | 9 +++++ 8 files changed, 78 insertions(+), 49 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index b04923fad9a..7f1f86a0da9 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -19,26 +19,28 @@ workflows: - {jobs: ['build'], std: 'max', cxx: ['intel', 'msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']} + - {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} # Disabled until we figure out the issue with the TBB dll - #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']} + #- {jobs: ['test'], project: ['thrust'], std: 'max', cxx: ['msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'} # Split up cub tests: - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']} - - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc']} - - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc'], gpu: 'rtxa6000'} + - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: ['cub'], std: 'max', cxx: 'gcc12', gpu: 'h100', sm: 'gpu' } # Modded builds: - {jobs: ['build'], std: [17, 20], ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'} # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. 
- - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} # default_projects: clang-cuda - {jobs: ['build'], std: [17, 20], cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'} - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # nvrtc: - - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'} + - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'} # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: @@ -52,19 +54,19 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 17, cxx: ['gcc'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: [17, 20], cxx: ['gcc', 'clang'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc']} + - {jobs: ['test'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'} # Python and c/parallel jobs: - - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} + - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} + - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'} + - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang'], gpu: 'rtx2080'} nightly: # Edge-case jobs - - {jobs: ['limited'], project: 'cub', std: 17} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} + - {jobs: ['limited'], project: 'cub', std: 17, gpu: 'rtx2080'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit', gpu: 'rtx4090'} # Old CTK - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']} - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} @@ -73,7 +75,11 @@ workflows: - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']} # Test current CTK - - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} + - {jobs: ['test'], project: 'cub', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'v100'} + - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'h100', sm: 'gpu' } + - {jobs: ['test'], project: 'thrust', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'} + - {jobs: ['test'], project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'} # Modded builds: - {jobs: 
['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} @@ -92,26 +98,9 @@ workflows: - {jobs: ['build'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['gcc12'], sm: "90"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13'], sm: "90a"} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['gcc13', 'clang16'], cpu: 'arm64'} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} - -# # These are waiting on the NVKS nodes: -# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]} -# - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} -# - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} -# # H100 runners are currently flakey, only build since those use CPU-only runners: -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} -# - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} -# -# # nvrtc: -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 't4', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc13', std: [20], project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all', project: ['libcudacxx']} -# - {jobs: ['nvrtc'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc13', std: [11, 20], project: ['libcudacxx']} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12'] , gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14'], gpu: 'rtx2080'} + - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18'], gpu: 'rtx2080'} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. 
exclude: diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1 index 1eb5f1a9d63..151bb1f112e 100644 --- a/ci/windows/build_common.psm1 +++ b/ci/windows/build_common.psm1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -20,6 +24,12 @@ if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") { Write-Host "Detected cl.exe version: $CL_VERSION" } +$script:GLOBAL_CMAKE_OPTIONS = "" +if ($CUDA_ARCH -ne 0) { + $script:GLOBAL_CMAKE_OPTIONS += "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH" +} + + if (-not $env:CCCL_BUILD_INFIX) { $env:CCCL_BUILD_INFIX = "" } @@ -56,6 +66,7 @@ Write-Host "NVCC_VERSION=$NVCC_VERSION" Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL" Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL" Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX" +Write-Host "GLOBAL_CMAKE_OPTIONS=$script:GLOBAL_CMAKE_OPTIONS" Write-Host "Current commit is:" Write-Host "$(git log -1 --format=short)" Write-Host "========================================" @@ -82,7 +93,7 @@ function configure_preset { pushd ".." # Echo and execute command to stdout: - $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE" + $configure_command = "cmake --preset $PRESET $script:GLOBAL_CMAKE_OPTIONS $CMAKE_OPTIONS --log-level VERBOSE" Write-Host $configure_command Invoke-Expression $configure_command $test_result = $LastExitCode diff --git a/ci/windows/build_cub.ps1 b/ci/windows/build_cub.ps1 index 32e4f71ee9a..27c5360ded9 100644 --- a/ci/windows/build_cub.ps1 +++ b/ci/windows/build_cub.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cub-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_cudax.ps1 b/ci/windows/build_cudax.ps1 index ca7bd578291..7b8cd0ff771 100644 --- a/ci/windows/build_cudax.ps1 +++ b/ci/windows/build_cudax.ps1 @@ -4,7 +4,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(20)] - [int]$CXX_STANDARD = 20 + [int]$CXX_STANDARD = 20, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $CURRENT_PATH = Split-Path $pwd -leaf @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { } Remove-Module -Name build_common -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "cudax-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_libcudacxx.ps1 b/ci/windows/build_libcudacxx.ps1 index a57e2280de7..2f80619f76b 100644 --- a/ci/windows/build_libcudacxx.ps1 +++ b/ci/windows/build_libcudacxx.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." } -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "libcudacxx-cpp${CXX_STANDARD}" $CMAKE_OPTIONS = "" diff --git a/ci/windows/build_thrust.ps1 b/ci/windows/build_thrust.ps1 index 186ed94eace..bda86859fd4 100644 --- a/ci/windows/build_thrust.ps1 +++ b/ci/windows/build_thrust.ps1 @@ -3,7 +3,11 @@ Param( [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] - [int]$CXX_STANDARD = 17 + [int]$CXX_STANDARD = 17, + [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0 ) $ErrorActionPreference = "Stop" @@ -14,7 +18,7 @@ If($CURRENT_PATH -ne "ci") { pushd "$PSScriptRoot/.." 
} -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" diff --git a/ci/windows/test_thrust.ps1 b/ci/windows/test_thrust.ps1 index 7c020714208..eabda06df5b 100644 --- a/ci/windows/test_thrust.ps1 +++ b/ci/windows/test_thrust.ps1 @@ -5,6 +5,10 @@ Param( [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17, [Parameter(Mandatory = $false)] + [ValidateNotNullOrEmpty()] + [Alias("arch")] + [int]$CUDA_ARCH = 0, + [Parameter(Mandatory = $false)] [Alias("cpu-only")] [switch]$CPU_ONLY = $false ) @@ -24,11 +28,11 @@ If($CURRENT_PATH -ne "ci") { } # Execute the build script: -$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD" +$build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD -arch $CUDA_ARCH" Write-Host "Executing: $build_command" Invoke-Expression $build_command -Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList $CXX_STANDARD, $CUDA_ARCH $PRESET = "thrust-cpu-cpp$CXX_STANDARD" diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index c86d24754de..80b382c5a46 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -370,6 +370,15 @@ foreach (test_src IN LISTS test_srcs) set(launcher 0) endif() + # FIXME: There are a few remaining device algorithm tests that have not been ported to + # use Catch2 and lid variants. Mark these as `lid_0/1` so they'll run in the appropriate + # CI configs: + string(REGEX MATCH "^device_" is_device_test "${test_name}") + _cub_is_fail_test(is_fail_test "${test_name}") + if (is_device_test AND NOT is_fail_test) + string(APPEND test_name ".lid_${launcher}") + endif() + # Only one version of this test.
cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher}) cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC}) From 3a594e30260dd3cf235b4274590f48dfcdfdbdb7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 16:33:47 +0100 Subject: [PATCH 5/8] Specialize `cuda::std::numeric_limits` for FP8 types (#3478) (#3492) Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> --- libcudacxx/include/cuda/std/limits | 192 ++++++++++++++++++ .../limits/numeric.limits.members/common.h | 27 +++ .../const_data_members.pass.cpp | 4 + .../denorm_min.pass.cpp | 4 + .../numeric.limits.members/digits.pass.cpp | 5 + .../numeric.limits.members/digits10.pass.cpp | 67 +++--- .../numeric.limits.members/epsilon.pass.cpp | 4 + .../has_denorm.pass.cpp | 4 + .../has_denorm_loss.pass.cpp | 4 + .../has_infinity.pass.cpp | 4 + .../has_quiet_NaN.pass.cpp | 4 + .../has_signaling_NaN.pass.cpp | 4 + .../numeric.limits.members/infinity.pass.cpp | 8 + .../is_bounded.pass.cpp | 4 + .../numeric.limits.members/is_exact.pass.cpp | 4 + .../numeric.limits.members/is_iec559.pass.cpp | 4 + .../is_integer.pass.cpp | 4 + .../numeric.limits.members/is_modulo.pass.cpp | 4 + .../numeric.limits.members/is_signed.pass.cpp | 4 + .../numeric.limits.members/lowest.pass.cpp | 4 + .../numeric.limits.members/max.pass.cpp | 4 + .../max_digits10.pass.cpp | 66 +++--- .../max_exponent.pass.cpp | 49 +++-- .../max_exponent10.pass.cpp | 49 +++-- .../numeric.limits.members/min.pass.cpp | 4 + .../min_exponent.pass.cpp | 49 +++-- .../min_exponent10.pass.cpp | 49 +++-- .../numeric.limits.members/quiet_NaN.pass.cpp | 64 ++++-- .../numeric.limits.members/radix.pass.cpp | 4 + .../round_error.pass.cpp | 4 + .../round_style.pass.cpp | 4 + .../signaling_NaN.pass.cpp | 64 ++++-- .../tinyness_before.pass.cpp | 4 + .../numeric.limits.members/traps.pass.cpp | 4 + 34 files changed, 627 insertions(+), 150 deletions(-) diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 5e4df32270e..2bd0191af43 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -29,6 +29,21 @@ #include #include +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + +#if _CCCL_HAS_NVFP8() +# include +#endif // _CCCL_HAS_NVFP8() + _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -744,6 +759,183 @@ public: }; #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() +# if defined(_CCCL_BUILTIN_BIT_CAST) || _CCCL_STD_VER >= 2014 +# define _LIBCUDACXX_CONSTEXPR_FP8_LIMITS constexpr +# else // ^^^ _CCCL_BUILTIN_BIT_CAST || _CCCL_STD_VER >= 2014 ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST && _CCCL_STD_VER < + // 2014 vvv +# define _LIBCUDACXX_CONSTEXPR_FP8_LIMITS +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST && _CCCL_STD_VER < 2014 ^^^ + +template <> +class __numeric_limits_impl<__nv_fp8_e4m3, __numeric_limits_type::__floating_point> +{ + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS __nv_fp8_e4m3 __make_value(__nv_fp8_storage_t __val) + { +# if defined(_CCCL_BUILTIN_BIT_CAST) + return _CUDA_VSTD::bit_cast<__nv_fp8_e4m3>(__val); +# else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST vvv + __nv_fp8_e4m3 __ret{}; + __ret.__x = __val; + return __ret; +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST ^^^ + } + +public: + using type = __nv_fp8_e4m3; + + 
static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 3; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x08u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type max() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7eu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type lowest() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0xfeu)); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type epsilon() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x20u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type round_error() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x30u)); + } + + static constexpr int min_exponent = -6; + static constexpr int min_exponent10 = -2; + static constexpr int max_exponent = 8; + static constexpr int max_exponent10 = 2; + + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = false; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type infinity() noexcept + { + return type{}; + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type quiet_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7fu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type signaling_NaN() noexcept + { + return type{}; + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type denorm_min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x01u)); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; + +template <> +class __numeric_limits_impl<__nv_fp8_e5m2, __numeric_limits_type::__floating_point> +{ + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS __nv_fp8_e5m2 __make_value(__nv_fp8_storage_t __val) + { +# if defined(_CCCL_BUILTIN_BIT_CAST) + return _CUDA_VSTD::bit_cast<__nv_fp8_e5m2>(__val); +# else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ // vvv !_CCCL_BUILTIN_BIT_CAST vvv + __nv_fp8_e5m2 __ret{}; + __ret.__x = __val; + return __ret; +# endif // ^^^ !_CCCL_BUILTIN_BIT_CAST ^^^ + } + +public: + using type = __nv_fp8_e5m2; + + static constexpr bool is_specialized = true; + + static constexpr bool is_signed = true; + static constexpr int digits = 2; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x04u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type max() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7bu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static 
_LIBCUDACXX_CONSTEXPR_FP8_LIMITS type lowest() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0xfbu)); + } + + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = __FLT_RADIX__; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type epsilon() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x34u)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type round_error() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x38u)); + } + + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_present; + static constexpr bool has_denorm_loss = false; + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type infinity() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7cu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type quiet_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7eu)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type signaling_NaN() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x7du)); + } + _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_CONSTEXPR_FP8_LIMITS type denorm_min() noexcept + { + return __make_value(static_cast<__nv_fp8_storage_t>(0x01u)); + } + + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +#endif // _CCCL_HAS_NVFP8() + template class numeric_limits : public __numeric_limits_impl<_Tp> {}; diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h index 15b48836839..8400071611c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h @@ -11,6 +11,7 @@ #define NUMERIC_LIMITS_MEMBERS_COMMON_H // Disable all the extended floating point operations and conversions +#define __CUDA_NO_FP8_CONVERSIONS__ 1 #define __CUDA_NO_HALF_CONVERSIONS__ 1 #define __CUDA_NO_HALF_OPERATORS__ 1 #define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1 @@ -24,6 +25,32 @@ __host__ __device__ bool float_eq(T x, T y) return x == y; } +#if _CCCL_HAS_NVFP8() +__host__ __device__ inline __nv_fp8_e4m3 make_fp8_e4m3(double x, __nv_saturation_t sat = __NV_NOSAT) +{ + __nv_fp8_e4m3 res; + res.__x = __nv_cvt_double_to_fp8(x, sat, __NV_E4M3); + return res; +} + +__host__ __device__ inline __nv_fp8_e5m2 make_fp8_e5m2(double x, __nv_saturation_t sat = __NV_NOSAT) +{ + __nv_fp8_e5m2 res; + res.__x = __nv_cvt_double_to_fp8(x, sat, __NV_E5M2); + return res; +} + +__host__ __device__ inline bool float_eq(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E4M3)}); +} + 
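
As a cross-check on the storage bytes used by the specializations above: e4m3 packs 1 sign, 4 exponent and 3 mantissa bits with bias 7, and e5m2 packs 1 sign, 5 exponent and 2 mantissa bits with bias 15, so every constant follows from the layout. A minimal host-side sketch, assuming plain C++ with no CUDA headers; decode_fp8 is a helper invented for this illustration and ignores the NaN/infinity encodings:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Decode a finite FP8 bit pattern with E exponent bits and M mantissa bits
    // (bias = 2^(E-1) - 1). Subnormals (exponent field 0) have no implicit 1.
    double decode_fp8(std::uint8_t bits, int E, int M)
    {
      const int bias     = (1 << (E - 1)) - 1;
      const int sign     = bits >> (E + M);
      const int exponent = (bits >> M) & ((1 << E) - 1);
      const int mantissa = bits & ((1 << M) - 1);
      const double value = (exponent == 0)
                           ? std::ldexp(mantissa, 1 - bias - M)                    // subnormal
                           : std::ldexp((1 << M) + mantissa, exponent - bias - M); // normal
      return sign ? -value : value;
    }

    int main()
    {
      assert(decode_fp8(0x08, 4, 3) == 0.015625);           // e4m3 min()  = 2^-6
      assert(decode_fp8(0x7e, 4, 3) == 448.0);              // e4m3 max()
      assert(decode_fp8(0x20, 4, 3) == 0.125);              // e4m3 epsilon() = 2^-3
      assert(decode_fp8(0x04, 5, 2) == 0.00006103515625);   // e5m2 min()  = 2^-14
      assert(decode_fp8(0x7b, 5, 2) == 57344.0);            // e5m2 max()
      assert(decode_fp8(0x01, 5, 2) == 0.0000152587890625); // e5m2 denorm_min() = 2^-16
      return 0;
    }

The same layout pins down the derived members: 3 (resp. 2) stored mantissa bits give digits = 4 for e4m3 and digits = 3 for e5m2 (epsilon() = 2^(1 - digits) matches the 0x20/0x34 patterns), and min()/max() force min_exponent/max_exponent of -5/9 for e4m3 and -13/16 for e5m2 under the usual FLT_MIN_EXP/FLT_MAX_EXP convention; e5m2 matches __half, which shares the 5-bit exponent. The decimal counterparts can be checked the same way; below, the test suite computes digits10/max_digits10 expectations with integer arithmetic, sketched here with function names invented for this note:

    // 30103/100000 approximates log10(2) = 0.30103 closely enough for any
    // realistic digits count, and keeps the computation valid in C++11 constexpr.
    constexpr int fp_digits10(int digits) // floor((digits - 1) * log10(2))
    {
      return static_cast<int>(((digits - 1) * 30103L) / 100000L);
    }
    constexpr int fp_max_digits10(int digits) // ceil(digits * log10(2)) + 1
    {
      return static_cast<int>((digits * 30103L + 99999L) / 100000L) + 1;
    }
    static_assert(fp_digits10(24) == 6 && fp_max_digits10(24) == 9, "float");
    static_assert(fp_digits10(4) == 0 && fp_max_digits10(4) == 3, "__nv_fp8_e4m3");
    static_assert(fp_digits10(3) == 0 && fp_max_digits10(3) == 2, "__nv_fp8_e5m2");
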
+__host__ __device__ inline bool float_eq(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + #if defined(_LIBCUDACXX_HAS_NVFP16) __host__ __device__ inline bool float_eq(__half x, __half y) { diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp index 769080cff83..b095d63afcd 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp @@ -116,6 +116,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test_type<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test_type<__nv_fp8_e4m3>(); + test_type<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp index cc64ed14686..475f41a3388 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.001953125)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.0000152587890625)); +#endif // _CCCL_HAS_NVFP8() #if !defined(__FLT_DENORM_MIN__) && !defined(FLT_TRUE_MIN) # error Test has no expected values for floating point types #endif diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp index efce1ccf678..0d3c910b672 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp @@ -61,5 +61,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 8>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 4>(); + test<__nv_fp8_e5m2, 3>(); +#endif // _CCCL_HAS_NVFP8() + return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp index 32990ece4b1..bd66aeecfeb 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp @@ -12,10 +12,25 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int 
make_expected_digits10() +{ + // digits * log10(2) + return static_cast((cuda::std::numeric_limits::digits * 30103l) / 100000l); +} + +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_digits10() +{ + // (digits - 1) * log10(2) + return static_cast(((cuda::std::numeric_limits::digits - 1) * 30103l) / 100000l); +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::digits10 == expected, "digits10 test 1"); @@ -30,41 +45,45 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); // 4 -> 9 and 2 -> 4 + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 38>(); - test<__uint128_t, 38>(); + test<__int128_t>(); + test<__uint128_t>(); #endif - test(); - test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - test(); + test(); #endif #if defined(_LIBCUDACXX_HAS_NVFP16) - test<__half, 3>(); + test<__half>(); #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) - test<__nv_bfloat16, 2>(); + test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp index fa42c5e8fe6..15366bdf308 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp @@ -63,6 +63,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.0078125)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.125)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.25)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp index 5a0a05ab73b..8fa506b93ce 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, cuda::std::denorm_present>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, cuda::std::denorm_present>(); + test<__nv_fp8_e5m2, cuda::std::denorm_present>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp index 
450e51b8111..3b7722acd8b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp index 646f5e20160..ebddcb4421e 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp index 626b4110695..908f2d7fa4a 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp index 20cd04d107e..62d81c8a524 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp index 34527e300c5..627105a4a8c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp @@ -70,6 +70,10 @@ int main(int, char**) # if defined(_LIBCUDACXX_HAS_NVBF16) 
test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0)); # endif // _LIBCUDACXX_HAS_NVBF16 +# if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); + test<__nv_fp8_e5m2>(make_fp8_e5m2(1.0 / 0.0)); +# endif // _CCCL_HAS_NVFP8() // MSVC has issues with producing INF with divisions by zero. #else test(INFINITY); @@ -83,6 +87,10 @@ int main(int, char**) # if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(INFINITY)); # endif // _LIBCUDACXX_HAS_NVBF16 +# if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(__nv_fp8_e4m3{}); + test<__nv_fp8_e5m2>(make_fp8_e5m2(INFINITY)); +# endif // _CCCL_HAS_NVFP8() #endif return 0; diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp index 9e671c5d905..eeb9740e4e2 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp index cfc9a6cab90..c3c2e027c72 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp index 945347ff4b5..7bab40e8826 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp index 65dd98fdb04..68e7437f1e0 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if 
defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp index 6d82269e1c8..992be2b18b7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp index eb39869bf24..be7e4f235a7 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, true>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, true>(); + test<__nv_fp8_e5m2, true>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp index e3b832dfd9b..6a8b2a9c181 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(-448.0)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(-57344.0)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp index 7ba6dabb1d2..a1582e41b22 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp @@ -71,6 +71,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(448.0)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(57344.0)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp index 92b3d13ea61..d01a4aa099c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp @@ -12,10 +12,24 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_digits10() +{ + return 0; +} + +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_digits10() +{ + // std::ceil(std::numeric_limits::digits * std::log10(2) + 1) + return static_cast((cuda::std::numeric_limits::digits * 30103l + 99999l) / 100000l) + 1; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_digits10 == expected, "max_digits10 test 1"); @@ -26,41 +40,45 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif - test(); - test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE - test(); + test(); #endif #if defined(_LIBCUDACXX_HAS_NVFP16) - test<__half, 5>(); + test<__half>(); #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) - test<__nv_bfloat16, 4>(); + test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp index 81d5ae07795..3027e9f06f5 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_exponent() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_exponent == expected, "max_exponent test 1"); @@ -26,29 +33,29 @@ __host__ __device__ void test() int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + 
test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 128>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 9>(); + test<__nv_fp8_e5m2, 16>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp index 4c426b37460..5924aee173d 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_max_exponent10() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::max_exponent10 == expected, "max_exponent10 test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, 38>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, 2>(); + test<__nv_fp8_e5m2, 4>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp index c24c3fde869..15f470909df 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp @@ -72,6 +72,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.015625)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.000061035)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp index e3150f8dc8e..b63d653a7c3 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp 
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_min_exponent() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::min_exponent == expected, "min_exponent test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, -125>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, -5>(); + test<__nv_fp8_e5m2, -13>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp index cbca8e04171..a6ff20e7fde 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp @@ -12,10 +12,17 @@ #include #include +#include #include "test_macros.h" -template +template ::value, int> = 0> +__host__ __device__ constexpr int make_expected_min_exponent10() +{ + return 0; +} + +template ()> __host__ __device__ void test() { static_assert(cuda::std::numeric_limits::min_exponent10 == expected, "min_exponent10 test 1"); @@ -26,29 +33,29 @@ int main(int, char**) { - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); #if TEST_STD_VER > 2017 && defined(__cpp_char8_t) - test(); + test(); #endif #ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); + test(); + test(); #endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - test(); - test(); - test(); - test(); - test(); - test(); - test(); - test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); #ifndef _LIBCUDACXX_HAS_NO_INT128 - test<__int128_t, 0>(); - test<__uint128_t, 0>(); + test<__int128_t>(); + test<__uint128_t>(); #endif test(); test(); @@ -61,6 +68,10 @@ #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, -37>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, -1>(); + test<__nv_fp8_e5m2, -4>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp index 74e7f427941..2d6d9582f5c 100644 --- 
a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp @@ -18,29 +18,63 @@ #include "test_macros.h" template -__host__ __device__ void test_imp(cuda::std::true_type) +__host__ __device__ bool is_nan(T x) { - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::quiet_NaN())); + return cuda::std::isnan(x); } +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool is_nan(__nv_fp8_e4m3 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}); +} + +__host__ __device__ bool is_nan(__nv_fp8_e5m2 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + template -__host__ __device__ void test_imp(cuda::std::false_type) +__host__ __device__ void test_impl(cuda::std::true_type) { - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); - assert(cuda::std::numeric_limits::quiet_NaN() == T()); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); + assert(is_nan(cuda::std::numeric_limits::quiet_NaN())); } template -__host__ __device__ inline void test() +__host__ __device__ bool equal_to(T x, T y) { - constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + return x == y; +} + +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool equal_to(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return x.__x == y.__x; +} - test_imp(cuda::std::integral_constant{}); +__host__ __device__ bool equal_to(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return x.__x == y.__x; +} +#endif // _CCCL_HAS_NVFP8() + +template +__host__ __device__ void test_impl(cuda::std::false_type) +{ + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::quiet_NaN(), T())); +} + +template +__host__ __device__ inline void test() +{ + test_impl(cuda::std::integral_constant::has_quiet_NaN>{}); } int main(int, char**) @@ -80,6 +114,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp index 9765db6f760..7e5c87927aa 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp @@ -61,6 +61,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, FLT_RADIX>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, FLT_RADIX>(); + 
test<__nv_fp8_e5m2, FLT_RADIX>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp index ba5049fc49f..d4faf373a09 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp @@ -63,6 +63,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(__double2bfloat16(0.5)); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(make_fp8_e4m3(0.5)); + test<__nv_fp8_e5m2>(make_fp8_e5m2(0.5)); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp index 3fb436381a7..8515581d650 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, cuda::std::round_to_nearest>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, cuda::std::round_to_nearest>(); + test<__nv_fp8_e5m2, cuda::std::round_to_nearest>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp index 69ba66038de..19ace1b3d2c 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp @@ -18,29 +18,63 @@ #include "test_macros.h" template -__host__ __device__ void test_imp(cuda::std::true_type) +__host__ __device__ bool is_nan(T x) { - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); - assert(cuda::std::isnan(cuda::std::numeric_limits::signaling_NaN())); + return cuda::std::isnan(x); } +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool is_nan(__nv_fp8_e4m3 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}); +} + +__host__ __device__ bool is_nan(__nv_fp8_e5m2 x) +{ + return is_nan(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}); +} +#endif // _CCCL_HAS_NVFP8() + template -__host__ __device__ void test_imp(cuda::std::false_type) +__host__ __device__ void test_impl(cuda::std::true_type) { - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); - assert(cuda::std::numeric_limits::signaling_NaN() == T()); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); 
+ assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); + assert(is_nan(cuda::std::numeric_limits::signaling_NaN())); } template -__host__ __device__ inline void test() +__host__ __device__ bool equal_to(T x, T y) { - constexpr bool is_float = cuda::std::is_floating_point::value || cuda::std::__is_extended_floating_point::value; + return x == y; +} + +#if _CCCL_HAS_NVFP8() +__host__ __device__ bool equal_to(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y) +{ + return x.__x == y.__x; +} - test_imp(cuda::std::integral_constant{}); +__host__ __device__ bool equal_to(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y) +{ + return x.__x == y.__x; +} +#endif // _CCCL_HAS_NVFP8() + +template +__host__ __device__ void test_impl(cuda::std::false_type) +{ + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); + assert(equal_to(cuda::std::numeric_limits::signaling_NaN(), T())); +} + +template +__host__ __device__ inline void test() +{ + test_impl(cuda::std::integral_constant::has_signaling_NaN>{}); } int main(int, char**) @@ -80,6 +114,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3>(); + test<__nv_fp8_e5m2>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index 70cde2711a1..38dec8c872b 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -60,6 +60,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index 7dd7eee68cc..55d7eb990db 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -66,6 +66,10 @@ int main(int, char**) #if defined(_LIBCUDACXX_HAS_NVBF16) test<__nv_bfloat16, false>(); #endif // _LIBCUDACXX_HAS_NVBF16 +#if _CCCL_HAS_NVFP8() + test<__nv_fp8_e4m3, false>(); + test<__nv_fp8_e5m2, false>(); +#endif // _CCCL_HAS_NVFP8() return 0; } From 5511c7e0e464be00dba5b56c612874576e3b2090 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 18:14:23 +0100 Subject: [PATCH 6/8] Backport to 2.8: Deprecate thrust universal iterator categories (#3461) (#3471) --- .../iterator/detail/universal_categories.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/thrust/thrust/iterator/detail/universal_categories.h b/thrust/thrust/iterator/detail/universal_categories.h index ea30f076beb..ee620f977a8 100644 --- a/thrust/thrust/iterator/detail/universal_categories.h +++ 
b/thrust/thrust/iterator/detail/universal_categories.h @@ -27,13 +27,13 @@ #endif // no system header #include -// XXX eliminate this file - +_CCCL_SUPPRESS_DEPRECATED_PUSH THRUST_NAMESPACE_BEGIN // define these types without inheritance to avoid ambiguous conversion to base classes -struct input_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED input_universal_iterator_tag { operator input_host_iterator_tag() { @@ -46,7 +46,8 @@ struct input_universal_iterator_tag } }; -struct output_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED output_universal_iterator_tag { operator output_host_iterator_tag() { @@ -59,7 +60,8 @@ struct output_universal_iterator_tag } }; -struct forward_universal_iterator_tag : input_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED forward_universal_iterator_tag : input_universal_iterator_tag { operator forward_host_iterator_tag() { @@ -72,7 +74,8 @@ struct forward_universal_iterator_tag : input_universal_iterator_tag }; }; -struct bidirectional_universal_iterator_tag : forward_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED bidirectional_universal_iterator_tag : forward_universal_iterator_tag { operator bidirectional_host_iterator_tag() { @@ -95,7 +98,8 @@ struct one_degree_of_separation : T } // namespace detail -struct random_access_universal_iterator_tag +// deprecated [Since 2.8] +struct CCCL_DEPRECATED random_access_universal_iterator_tag { // these conversions are all P0 operator random_access_host_iterator_tag() @@ -115,4 +119,5 @@ struct random_access_universal_iterator_tag } }; +_CCCL_SUPPRESS_DEPRECATED_POP THRUST_NAMESPACE_END From 3f8c8d5d3391856d0b27d5e0ef33d59eb167c322 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 19:40:33 +0100 Subject: [PATCH 7/8] Deprecate and replace thrust::cuda_cub iterators (#3422) (#3510) --- thrust/thrust/system/cuda/detail/count.h | 3 ++- thrust/thrust/system/cuda/detail/extrema.h | 12 ++++++---- thrust/thrust/system/cuda/detail/find.h | 8 ++++--- .../thrust/system/cuda/detail/inner_product.h | 15 +++++------- thrust/thrust/system/cuda/detail/mismatch.h | 20 +++++++--------- .../system/cuda/detail/transform_scan.h | 7 +++--- thrust/thrust/system/cuda/detail/util.h | 24 ++++++++++++------- 7 files changed, 48 insertions(+), 41 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/count.h b/thrust/thrust/system/cuda/detail/count.h index cb9b7017902..530682ba717 100644 --- a/thrust/thrust/system/cuda/detail/count.h +++ b/thrust/thrust/system/cuda/detail/count.h @@ -40,6 +40,7 @@ # include # include +# include # include # include @@ -52,7 +53,7 @@ typename iterator_traits::difference_type _CCCL_HOST_DEVICE count_if(execution_policy& policy, InputIt first, InputIt last, UnaryPred unary_pred) { using size_type = typename iterator_traits::difference_type; - using flag_iterator_t = transform_input_iterator_t; + using flag_iterator_t = transform_iterator; return cuda_cub::reduce_n( policy, flag_iterator_t(first, unary_pred), thrust::distance(first, last), size_type(0), plus()); diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h index f10c3578173..617eb8bbc79 100644 --- a/thrust/thrust/system/cuda/detail/extrema.h +++ b/thrust/thrust/system/cuda/detail/extrema.h @@ -45,6 +45,8 @@ # include # include # include +# include +# include # include # include # include @@ -370,10 +372,10 @@ element(execution_policy& policy, ItemsIt first, ItemsIt last, BinaryPr 
IndexType num_items = static_cast(thrust::distance(first, last)); - using iterator_tuple = tuple>; + using iterator_tuple = tuple>; using zip_iterator = zip_iterator; - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator(0)); using arg_min_t = ArgFunctor; using T = tuple; @@ -443,15 +445,15 @@ minmax_element(execution_policy& policy, ItemsIt first, ItemsIt last, B const auto num_items = static_cast(thrust::distance(first, last)); - using iterator_tuple = tuple>; + using iterator_tuple = tuple>; using zip_iterator = zip_iterator; - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator(0)); using arg_minmax_t = __extrema::arg_minmax_f; using two_pairs_type = typename arg_minmax_t::two_pairs_type; using duplicate_t = typename arg_minmax_t::duplicate_tuple; - using transform_t = transform_input_iterator_t; + using transform_t = transform_iterator; zip_iterator begin = make_zip_iterator(iter_tuple); two_pairs_type result = __extrema::extrema( diff --git a/thrust/thrust/system/cuda/detail/find.h b/thrust/thrust/system/cuda/detail/find.h index de633c73ebb..4b4e11c8ed9 100644 --- a/thrust/thrust/system/cuda/detail/find.h +++ b/thrust/thrust/system/cuda/detail/find.h @@ -41,6 +41,8 @@ # include # include +# include +# include # include THRUST_NAMESPACE_BEGIN @@ -116,11 +118,11 @@ find_if_n(execution_policy& policy, InputIt first, Size num_items, Pred const Size interval_size = (thrust::min)(interval_threshold, num_items); // force transform_iterator output to bool - using XfrmIterator = transform_input_iterator_t; - using IteratorTuple = thrust::tuple>; + using XfrmIterator = transform_iterator; + using IteratorTuple = thrust::tuple>; using ZipIterator = thrust::zip_iterator; - IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator_t(0)); + IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator(0)); ZipIterator begin = thrust::make_zip_iterator(iter_tuple); ZipIterator end = begin + num_items; diff --git a/thrust/thrust/system/cuda/detail/inner_product.h b/thrust/thrust/system/cuda/detail/inner_product.h index af41c5ccda8..1c36d5e256f 100644 --- a/thrust/thrust/system/cuda/detail/inner_product.h +++ b/thrust/thrust/system/cuda/detail/inner_product.h @@ -39,15 +39,15 @@ #if _CCCL_HAS_CUDA_COMPILER # include # include +# include +# include # include - -# include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - template T _CCCL_HOST_DEVICE inner_product( execution_policy& policy, @@ -58,11 +58,9 @@ T _CCCL_HOST_DEVICE inner_product( ReduceOp reduce_op, ProductOp product_op) { - using size_type = typename iterator_traits::difference_type; - size_type num_items = static_cast(thrust::distance(first1, last1)); - using binop_iterator_t = transform_pair_of_input_iterators_t; - - return cuda_cub::reduce_n(policy, binop_iterator_t(first1, first2, product_op), num_items, init, reduce_op); + const auto n = thrust::distance(first1, last1); + const auto first = make_transform_iterator(make_zip_iterator(first1, first2), make_zip_function(product_op)); + return cuda_cub::reduce_n(policy, first, n, init, reduce_op); } template @@ -71,7 +69,6 @@ inner_product(execution_policy& policy, InputIt1 first1, InputIt1 last1 { return cuda_cub::inner_product(policy, first1, last1, first2, init, plus(), multiplies()); } - } // namespace 
cuda_cub THRUST_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/mismatch.h b/thrust/thrust/system/cuda/detail/mismatch.h index fc7a878b7d5..9987799faca 100644 --- a/thrust/thrust/system/cuda/detail/mismatch.h +++ b/thrust/thrust/system/cuda/detail/mismatch.h @@ -40,15 +40,14 @@ # include # include +# include # include # include - -# include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub { - template pair _CCCL_HOST_DEVICE mismatch(execution_policy& policy, InputIt1 first1, InputIt1 last1, InputIt2 first2, BinaryPred binary_pred); @@ -69,15 +68,12 @@ template pair _CCCL_HOST_DEVICE mismatch(execution_policy& policy, InputIt1 first1, InputIt1 last1, InputIt2 first2, BinaryPred binary_pred) { - using transform_t = transform_pair_of_input_iterators_t; - - transform_t transform_first = transform_t(first1, first2, binary_pred); - - transform_t result = cuda_cub::find_if_not( - policy, transform_first, transform_first + thrust::distance(first1, last1), ::cuda::std::__identity{}); - - return thrust::make_pair(first1 + thrust::distance(transform_first, result), - first2 + thrust::distance(transform_first, result)); + const auto n = thrust::distance(first1, last1); + const auto first = make_zip_iterator(first1, first2); + const auto last = make_zip_iterator(last1, first2 + n); + const auto mismatch_pos = cuda_cub::find_if_not(policy, first, last, make_zip_function(binary_pred)); + const auto dist = thrust::distance(first, mismatch_pos); + return thrust::make_pair(first1 + dist, first2 + dist); } template diff --git a/thrust/thrust/system/cuda/detail/transform_scan.h b/thrust/thrust/system/cuda/detail/transform_scan.h index ed94edc7d47..2227249ab16 100644 --- a/thrust/thrust/system/cuda/detail/transform_scan.h +++ b/thrust/thrust/system/cuda/detail/transform_scan.h @@ -39,6 +39,7 @@ #if _CCCL_HAS_CUDA_COMPILER # include # include +# include # include # include @@ -66,7 +67,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::inclusive_scan_n(policy, transformed_iterator_t(first, transform_op), num_items, result, scan_op); } @@ -87,7 +88,7 @@ OutputIt _CCCL_HOST_DEVICE transform_inclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::inclusive_scan_n( policy, transformed_iterator_t(first, transform_op), num_items, result, init, scan_op); @@ -108,7 +109,7 @@ OutputIt _CCCL_HOST_DEVICE transform_exclusive_scan( using size_type = typename iterator_traits::difference_type; size_type num_items = static_cast(thrust::distance(first, last)); - using transformed_iterator_t = transform_input_iterator_t; + using transformed_iterator_t = transform_iterator; return cuda_cub::exclusive_scan_n( policy, transformed_iterator_t(first, transform_op), num_items, result, init, scan_op); diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h index 4cdde4508d0..27f0e1af288 100644 --- a/thrust/thrust/system/cuda/detail/util.h +++ b/thrust/thrust/system/cuda/detail/util.h @@ -249,12 +249,13 @@ _CCCL_HOST_DEVICE inline void throw_on_error(cudaError_t status, char const* msg } } -// FIXME: Move the 
iterators elsewhere. - +// deprecated [Since 2.8] template -struct transform_input_iterator_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator") transform_input_iterator_t { - using self_t = transform_input_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = transform_input_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = typename iterator_traits::difference_type; using value_type = ValueType; using pointer = void; @@ -358,10 +359,14 @@ struct transform_input_iterator_t } }; // struct transform_input_iterarot_t +// deprecated [Since 2.8] template -struct transform_pair_of_input_iterators_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator of a thrust::zip_iterator") + transform_pair_of_input_iterators_t { - using self_t = transform_pair_of_input_iterators_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = transform_pair_of_input_iterators_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = typename iterator_traits::difference_type; using value_type = ValueType; using pointer = void; @@ -488,10 +493,13 @@ struct CCCL_DEPRECATED_BECAUSE("Use cuda::std::identity") identity } }; +// deprecated [Since 2.8] template -struct counting_iterator_t +struct CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator") counting_iterator_t { - using self_t = counting_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_PUSH + using self_t = counting_iterator_t; + _CCCL_SUPPRESS_DEPRECATED_POP using difference_type = T; using value_type = T; using pointer = void; From d19f3426eafbc742e047528681f07583e17255b7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 30 Jan 2025 21:26:05 +0100 Subject: [PATCH 8/8] Deprecate thrust macros from type_deduction.h (#3501) (#3511) --- thrust/thrust/detail/type_deduction.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/thrust/thrust/detail/type_deduction.h b/thrust/thrust/detail/type_deduction.h index a1d41de9676..717edc366ee 100644 --- a/thrust/thrust/detail/type_deduction.h +++ b/thrust/thrust/detail/type_deduction.h @@ -30,16 +30,18 @@ /// \def THRUST_MVCAP(x) /// \brief Capture `x` into a lambda by moving. -/// +/// deprecated [Since 2.8] #define THRUST_MVCAP(x) x = ::std::move(x) /// \def THRUST_RETOF(invocable, ...) /// \brief Expands to the type returned by invoking an instance of the invocable /// type \a invocable with parameters of type \c __VA_ARGS__. Must /// be called with 1 or fewer parameters to the invocable. -/// -#define THRUST_RETOF(...) THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__) -#define THRUST_RETOF1(C) decltype(::std::declval()()) +/// deprecated [Since 2.8] +#define THRUST_RETOF(...) THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__) +/// deprecated [Since 2.8] +#define THRUST_RETOF1(C) decltype(::std::declval()()) +/// deprecated [Since 2.8] #define THRUST_RETOF2(C, V) decltype(::std::declval()(::std::declval())) /// \def THRUST_RETURNS(...) @@ -88,6 +90,7 @@ } \ /**/ #else +/// deprecated [Since 2.8] # define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...) \ noexcept(noexcept(__VA_ARGS__))->typename std::enable_if::type \ { \
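
For code migrating off the thrust::cuda_cub detail iterators deprecated above, the public Thrust iterators compose the same way the updated count.h, inner_product.h and mismatch.h now do. A minimal sketch, assuming nvcc and a CCCL 2.8-era Thrust; the square functor and the input vectors are invented for the example:

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/zip_function.h>

    struct square
    {
      __host__ __device__ float operator()(float x) const
      {
        return x * x;
      }
    };

    int main()
    {
      thrust::device_vector<float> a(100, 2.0f);
      thrust::device_vector<float> b(100, 3.0f);

      // Was: cuda_cub::transform_input_iterator_t -> thrust::transform_iterator
      auto squared = thrust::make_transform_iterator(a.begin(), square{});
      const float sum_sq = thrust::reduce(squared, squared + a.size()); // 100 * 4 = 400

      // Was: cuda_cub::transform_pair_of_input_iterators_t
      //   -> thrust::transform_iterator over a thrust::zip_iterator
      auto products = thrust::make_transform_iterator(
        thrust::make_zip_iterator(a.begin(), b.begin()), thrust::make_zip_function(thrust::multiplies<float>{}));
      const float dot = thrust::reduce(products, products + a.size()); // 100 * 6 = 600

      // Was: cuda_cub::counting_iterator_t -> thrust::counting_iterator
      thrust::counting_iterator<int> indices(0); // indices[i] == i

      return (sum_sq == 400.0f && dot == 600.0f && indices[42] == 42) ? 0 : 1;
    }

The transform_iterator-over-zip_iterator shape with make_zip_function is the same rewrite patch 7 applies to inner_product and mismatch, so call sites keep their semantics while the deprecated helpers in util.h age out.
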