From 0c17dbd005a934ffe2f83cf0b73a6a9aa5383852 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Wed, 29 Jan 2025 17:28:19 -0800
Subject: [PATCH 01/15] Deprecate and replace `CUB_IS_INT128_ENABLED` (#3427)

Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 cub/cub/detail/fast_modulo_division.cuh       |  6 +++---
 .../device/dispatch/dispatch_histogram.cuh    | 12 +++++------
 .../tuning/tuning_run_length_encode.cuh       |  8 ++++----
 .../device/dispatch/tuning/tuning_scan.cuh    |  4 ++--
 .../dispatch/tuning/tuning_scan_by_key.cuh    | 20 +++++++++----------
 .../dispatch/tuning/tuning_select_if.cuh      | 16 +++++++--------
 cub/cub/util_ptx.cuh                          |  2 +-
 cub/cub/util_type.cuh                         | 13 ++----------
 .../catch2_test_device_for_each_in_extents.cu |  4 ++--
 cub/test/catch2_test_printing.cu              |  2 +-
 cub/test/internal/catch2_test_fast_div_mod.cu |  2 +-
 cub/test/test_util.h                          |  2 +-
 12 files changed, 41 insertions(+), 50 deletions(-)
diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh
index 4a5f2048e32..09068d87be0 100644
--- a/cub/cub/detail/fast_modulo_division.cuh
+++ b/cub/cub/detail/fast_modulo_division.cuh
@@ -38,7 +38,7 @@
 #endif // no system header
 
 #include <cub/detail/type_traits.cuh> // implicit_prom_t
-#include <cub/util_type.cuh> // CUB_IS_INT128_ENABLED
+#include <cub/util_type.cuh> // _CCCL_HAS_INT128()
 
 #include <cuda/cmath> // cuda::std::ceil_div
 #include <cuda/std/bit> // std::has_single_bit
@@ -79,7 +79,7 @@ struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 4)>
   using type = ::cuda::std::uint64_t;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 
 template <typename T>
 struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 8)>::type>
@@ -87,7 +87,7 @@ struct larger_unsigned_type<T, typename ::cuda::std::enable_if<(sizeof(T) == 8)>
   using type = __uint128_t;
 };
 
-#endif // CUB_IS_INT128_ENABLED
+#endif // _CCCL_HAS_INT128()
 
 template <typename T>
 using larger_unsigned_type_t = typename larger_unsigned_type<T>::type;
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 2ac4e160220..2c2d0a2a9ca 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -646,27 +646,27 @@ public:
     using IntArithmeticT = ::cuda::std::_If< //
       sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), //
       uint32_t, //
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
       ::cuda::std::_If< //
         (::cuda::std::is_same<CommonT, __int128_t>::value || //
          ::cuda::std::is_same<CommonT, __uint128_t>::value), //
         CommonT, //
         uint64_t> //
-#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv
+#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv
       uint64_t
-#endif // !CUB_IS_INT128_ENABLED
+#endif // !_CCCL_HAS_INT128()
       >;
 
     // Alias template that excludes __[u]int128 from the integral types
     template <typename T>
     using is_integral_excl_int128 =
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
       ::cuda::std::_If<::cuda::std::is_same<T, __int128_t>::value&& ::cuda::std::is_same<T, __uint128_t>::value,
                        ::cuda::std::false_type,
                        ::cuda::std::is_integral<T>>;
-#else // ^^^ CUB_IS_INT128_ENABLED ^^^ / vvv !CUB_IS_INT128_ENABLED vvv
+#else // ^^^ _CCCL_HAS_INT128() ^^^ / vvv !_CCCL_HAS_INT128() vvv
       ::cuda::std::is_integral<T>;
-#endif // !CUB_IS_INT128_ENABLED
+#endif // !_CCCL_HAS_INT128()
 
     union ScaleT
     {
diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
index d938209dcf2..12f07f3f366 100644
--- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
@@ -156,7 +156,7 @@ struct sm80_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor                            = detail::no_delay_constructor_t<1075>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class LengthT>
 struct sm80_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -216,7 +216,7 @@ struct sm90_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor                            = detail::no_delay_constructor_t<515>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class LengthT>
 struct sm90_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -349,7 +349,7 @@ struct sm80_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor                            = detail::no_delay_constructor_t<1065>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class LengthT>
 struct sm80_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -414,7 +414,7 @@ struct sm90_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor                            = detail::no_delay_constructor_t<840>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class LengthT>
 struct sm90_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 7b076507341..165a17cae52 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -175,7 +175,7 @@ struct sm80_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::
   static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm80_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16>
 {
@@ -221,7 +221,7 @@ template <class T> struct sm90_tuning<T, primitive_op::yes, primitive_accum::yes
 template <> struct sm90_tuning<float,  primitive_op::yes, primitive_accum::yes, accum_size::_4> : sm90_tuning_vals<float,  128, 24, 688, 1140> {};
 template <> struct sm90_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::_8> : sm90_tuning_vals<double, 224, 24, 576, 1215> {};
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {};
 template <>
 struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16>
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
index f8e29201eea..2bc31ef6697 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
@@ -172,7 +172,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<124, 1040>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
 {
@@ -229,7 +229,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<160, 695>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
 {
@@ -286,7 +286,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<888, 635>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
 {
@@ -343,7 +343,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8,
   using delay_constructor                              = no_delay_constructor_t<1160>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
 {
@@ -400,7 +400,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8,
   using delay_constructor                              = no_delay_constructor_t<1030>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
 {
@@ -465,7 +465,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<488, 1070>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
 {
@@ -522,7 +522,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<352, 1170>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
 {
@@ -579,7 +579,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<556, 1195>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
 {
@@ -636,7 +636,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<600, 930>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
 {
@@ -693,7 +693,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8,
   using delay_constructor                              = fixed_delay_constructor_t<320, 1200>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
 {
diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
index 10d22286068..c1b74b4ae09 100644
--- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
@@ -121,7 +121,7 @@ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primit
   using delay_constructor                            = detail::fixed_delay_constructor_t<832, 1165>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -174,7 +174,7 @@ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primi
   using delay_constructor                            = detail::no_delay_constructor_t<1130>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -227,7 +227,7 @@ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primi
   using delay_constructor                            = detail::fixed_delay_constructor_t<68, 1160>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -280,7 +280,7 @@ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, prim
   using delay_constructor                            = detail::fixed_delay_constructor_t<884, 1130>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -336,7 +336,7 @@ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primit
   using delay_constructor                            = detail::fixed_delay_constructor_t<380, 1140>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -389,7 +389,7 @@ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primi
   using delay_constructor                            = detail::fixed_delay_constructor_t<360, 1170>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -442,7 +442,7 @@ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primi
   using delay_constructor                            = detail::fixed_delay_constructor_t<512, 1075>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -495,7 +495,7 @@ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, prim
   using delay_constructor                            = detail::fixed_delay_constructor_t<532, 1180>;
 };
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 template <>
 struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index 99beeed313e..e6bb45c4a31 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -99,7 +99,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type
   return (source >> bit_start) & MASK;
 }
 
-#  if CUB_IS_INT128_ENABLED
+#  if _CCCL_HAS_INT128()
 /**
  * Bitfield-extract for 128-bit types.
  */
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index 4d1db99a821..a89cd159309 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -76,17 +76,8 @@ _CCCL_DIAG_POP
 CUB_NAMESPACE_BEGIN
 
 #ifndef CUB_IS_INT128_ENABLED
-#  if defined(__CUDACC_RTC__)
-#    if defined(__CUDACC_RTC_INT128__)
-#      define CUB_IS_INT128_ENABLED 1
-#    endif // !defined(__CUDACC_RTC_INT128__)
-#  else // !defined(__CUDACC_RTC__)
-#    if _CCCL_CUDACC_AT_LEAST(11, 5)
-#      if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)
-#        define CUB_IS_INT128_ENABLED 1
-#      endif // GCC || CLANG || NVHPC
-#    endif // _CCCL_CUDACC_AT_LEAST(11, 5)
-#  endif // !defined(__CUDACC_RTC__)
+// Deprecated [Since 2.8]
+#  define CUB_IS_INT128_ENABLED _CCCL_HAS_INT128()
 #endif // !defined(CUB_IS_INT128_ENABLED)
 
 /******************************************************************************
diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu
index 3e5a6c6689a..313b9e58b38 100644
--- a/cub/test/catch2_test_device_for_each_in_extents.cu
+++ b/cub/test/catch2_test_device_for_each_in_extents.cu
@@ -107,7 +107,7 @@ using index_types =
                  uint16_t,
                  int32_t,
                  uint32_t
-#  if CUB_IS_INT128_ENABLED
+#  if _CCCL_HAS_INT128()
                  ,
                  int64_t,
                  uint64_t
@@ -120,7 +120,7 @@ using index_types_dynamic =
                  uint16_t,
                  int32_t,
                  uint32_t
-#  if CUB_IS_INT128_ENABLED
+#  if _CCCL_HAS_INT128()
                  ,
                  int64_t,
                  uint64_t
diff --git a/cub/test/catch2_test_printing.cu b/cub/test/catch2_test_printing.cu
index 6f93515114a..63b622f3554 100644
--- a/cub/test/catch2_test_printing.cu
+++ b/cub/test/catch2_test_printing.cu
@@ -11,7 +11,7 @@ std::string print(T val)
   return ss.str();
 }
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 TEST_CASE("Test utils can print __int128", "[test][utils]")
 {
   REQUIRE(print(__int128_t{0}) == "0");
diff --git a/cub/test/internal/catch2_test_fast_div_mod.cu b/cub/test/internal/catch2_test_fast_div_mod.cu
index 8a1a3e96a27..ec3b5e20d68 100644
--- a/cub/test/internal/catch2_test_fast_div_mod.cu
+++ b/cub/test/internal/catch2_test_fast_div_mod.cu
@@ -42,7 +42,7 @@ using index_types =
                  uint16_t,
                  int32_t,
                  uint32_t
-#  if CUB_IS_INT128_ENABLED
+#  if _CCCL_HAS_INT128()
                  ,
                  int64_t,
                  uint64_t
diff --git a/cub/test/test_util.h b/cub/test/test_util.h
index 031298120dc..9a5fefcc69c 100644
--- a/cub/test/test_util.h
+++ b/cub/test/test_util.h
@@ -717,7 +717,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
   return os;
 }
 
-#if CUB_IS_INT128_ENABLED
+#if _CCCL_HAS_INT128()
 inline std::ostream& operator<<(std::ostream& os, __uint128_t val)
 {
   constexpr int max_digits      = 40;

From c02e845e7f40dc748777638ce70e9893560e473c Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 30 Jan 2025 07:39:35 +0100
Subject: [PATCH 02/15] Adds support for large num items to `DeviceMerge`
 (#3530)

* adds support for large num items

* re-enable vsmem tests

* rephrases test description
---
 cub/cub/device/device_merge.cuh      |  18 ++--
 cub/test/catch2_test_device_merge.cu | 129 +++++----------------------
 2 files changed, 33 insertions(+), 114 deletions(-)

diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh
index 7135546a0e6..814bad75248 100644
--- a/cub/cub/device/device_merge.cuh
+++ b/cub/cub/device/device_merge.cuh
@@ -76,16 +76,19 @@ struct DeviceMerge
     void* d_temp_storage,
     std::size_t& temp_storage_bytes,
     KeyIteratorIn1 keys_in1,
-    int num_keys1,
+    ::cuda::std::int64_t num_keys1,
     KeyIteratorIn2 keys_in2,
-    int num_keys2,
+    ::cuda::std::int64_t num_keys2,
     KeyIteratorOut keys_out,
     CompareOp compare_op = {},
     cudaStream_t stream  = nullptr)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys");
+
+    using offset_t = ::cuda::std::int64_t;
+
     return detail::merge::
-      dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, int, CompareOp>::
+      dispatch_t<KeyIteratorIn1, NullType*, KeyIteratorIn2, NullType*, KeyIteratorOut, NullType*, offset_t, CompareOp>::
         dispatch(
           d_temp_storage,
           temp_storage_bytes,
@@ -161,16 +164,19 @@ struct DeviceMerge
     std::size_t& temp_storage_bytes,
     KeyIteratorIn1 keys_in1,
     ValueIteratorIn1 values_in1,
-    int num_pairs1,
+    ::cuda::std::int64_t num_pairs1,
     KeyIteratorIn2 keys_in2,
     ValueIteratorIn2 values_in2,
-    int num_pairs2,
+    ::cuda::std::int64_t num_pairs2,
     KeyIteratorOut keys_out,
     ValueIteratorOut values_out,
     CompareOp compare_op = {},
     cudaStream_t stream  = nullptr)
   {
     CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs");
+
+    using offset_t = ::cuda::std::int64_t;
+
     return detail::merge::dispatch_t<
       KeyIteratorIn1,
       ValueIteratorIn1,
@@ -178,7 +184,7 @@ struct DeviceMerge
       ValueIteratorIn2,
       KeyIteratorOut,
       ValueIteratorOut,
-      int,
+      offset_t,
       CompareOp>::dispatch(d_temp_storage,
                            temp_storage_bytes,
                            keys_in1,
diff --git a/cub/test/catch2_test_device_merge.cu b/cub/test/catch2_test_device_merge.cu
index ae0d3f84baa..4835f597710 100644
--- a/cub/test/catch2_test_device_merge.cu
+++ b/cub/test/catch2_test_device_merge.cu
@@ -20,103 +20,8 @@
 DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs);
 DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys);
 
-// TODO(bgruber): replace the following by the CUB device API directly, once we have figured out how to handle different
-// offset types
-namespace detail
-{
-template <typename KeyIteratorIn1,
-          typename KeyIteratorIn2,
-          typename KeyIteratorOut,
-          typename Offset,
-          typename CompareOp = ::cuda::std::less<>>
-CUB_RUNTIME_FUNCTION static cudaError_t merge_keys_custom_offset_type(
-  void* d_temp_storage,
-  std::size_t& temp_storage_bytes,
-  KeyIteratorIn1 keys_in1,
-  Offset num_keys1,
-  KeyIteratorIn2 keys_in2,
-  Offset num_keys2,
-  KeyIteratorOut keys_out,
-  CompareOp compare_op = {},
-  cudaStream_t stream  = 0)
-{
-  CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergeKeys");
-  return cub::detail::merge::dispatch_t<
-    KeyIteratorIn1,
-    cub::NullType*,
-    KeyIteratorIn2,
-    cub::NullType*,
-    KeyIteratorOut,
-    cub::NullType*,
-    Offset,
-    CompareOp>::dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         keys_in1,
-                         nullptr,
-                         num_keys1,
-                         keys_in2,
-                         nullptr,
-                         num_keys2,
-                         keys_out,
-                         nullptr,
-                         compare_op,
-                         stream);
-}
-
-template <typename KeyIteratorIn1,
-          typename ValueIteratorIn1,
-          typename KeyIteratorIn2,
-          typename ValueIteratorIn2,
-          typename KeyIteratorOut,
-          typename ValueIteratorOut,
-          typename Offset,
-          typename CompareOp = ::cuda::std::less<>>
-CUB_RUNTIME_FUNCTION static cudaError_t merge_pairs_custom_offset_type(
-  void* d_temp_storage,
-  std::size_t& temp_storage_bytes,
-  KeyIteratorIn1 keys_in1,
-  ValueIteratorIn1 values_in1,
-  Offset num_pairs1,
-  KeyIteratorIn2 keys_in2,
-  ValueIteratorIn2 values_in2,
-  Offset num_pairs2,
-  KeyIteratorOut keys_out,
-  ValueIteratorOut values_out,
-  CompareOp compare_op = {},
-  cudaStream_t stream  = 0)
-{
-  CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMerge::MergePairs");
-  return cub::detail::merge::dispatch_t<
-    KeyIteratorIn1,
-    ValueIteratorIn1,
-    KeyIteratorIn2,
-    ValueIteratorIn2,
-    KeyIteratorOut,
-    ValueIteratorOut,
-    Offset,
-    CompareOp>::dispatch(d_temp_storage,
-                         temp_storage_bytes,
-                         keys_in1,
-                         values_in1,
-                         num_pairs1,
-                         keys_in2,
-                         values_in2,
-                         num_pairs2,
-                         keys_out,
-                         values_out,
-                         compare_op,
-                         stream);
-}
-} // namespace detail
-
-DECLARE_LAUNCH_WRAPPER(detail::merge_keys_custom_offset_type, merge_keys_custom_offset_type);
-DECLARE_LAUNCH_WRAPPER(detail::merge_pairs_custom_offset_type, merge_pairs_custom_offset_type);
-
 using types = c2h::type_list<std::uint8_t, std::int16_t, std::uint32_t, double>;
 
-// gevtushenko: there is no code path in CUB and Thrust that leads to unsigned offsets, so let's safe some compile time
-using offset_types = c2h::type_list<std::int32_t, std::int64_t>;
-
 template <typename Key,
           typename Offset,
           typename CompareOp = ::cuda::std::less<Key>,
@@ -223,11 +128,27 @@ C2H_TEST("DeviceMerge::MergeKeys large key types", "[merge][device]", c2h::type_
     });
 }
 
-C2H_TEST("DeviceMerge::MergeKeys offset types", "[merge][device]", offset_types)
+C2H_TEST("DeviceMerge::MergeKeys works for large number of items", "[merge][device]")
+
+try
+{
+  using key_t    = char;
+  using offset_t = int64_t;
+
+  // Sizes are derived from INT32_MAX (2^31 - 1) to cover inputs just below, at, and beyond the 32-bit signed range
+  const auto num_items_int_max = static_cast<offset_t>(::cuda::std::numeric_limits<std::int32_t>::max());
+
+  // Generate the input sizes to test for
+  const offset_t num_items_lhs =
+    GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max - 1, offset_t{3}}));
+  const offset_t num_items_rhs =
+    GENERATE_COPY(values({num_items_int_max + offset_t{1000000}, num_items_int_max, offset_t{3}}));
+
+  test_keys<key_t, offset_t>(num_items_lhs, num_items_rhs, ::cuda::std::less<>{});
+}
+catch (const std::bad_alloc&)
 {
-  using key_t    = int;
-  using offset_t = c2h::get<0, TestType>;
-  test_keys<key_t, offset_t>(3623, 6346, ::cuda::std::less<>{}, merge_keys_custom_offset_type);
+  // allocation failure is not a test failure, so we can run tests on smaller GPUs
 }
 
 C2H_TEST("DeviceMerge::MergeKeys input sizes", "[merge][device]")
@@ -385,14 +306,6 @@ C2H_TEST("DeviceMerge::MergePairs value types", "[merge][device]", types)
   test_pairs<key_t, value_t, offset_t>();
 }
 
-C2H_TEST("DeviceMerge::MergePairs offset types", "[merge][device]", offset_types)
-{
-  using key_t    = int;
-  using value_t  = int;
-  using offset_t = c2h::get<0, TestType>;
-  test_pairs<key_t, value_t, offset_t>(3623, 6346, ::cuda::std::less<>{}, merge_pairs_custom_offset_type);
-}
-
 C2H_TEST("DeviceMerge::MergePairs input sizes", "[merge][device]")
 {
   using key_t      = int;
@@ -410,7 +323,7 @@ try
   using key_t     = char;
   using value_t   = char;
   const auto size = std::int64_t{1} << GENERATE(30, 31, 32, 33);
-  test_pairs<key_t, value_t>(size, size, ::cuda::std::less<>{}, merge_pairs_custom_offset_type);
+  test_pairs<key_t, value_t>(size, size, ::cuda::std::less<>{});
 }
 catch (const std::bad_alloc&)
 {

From a654bc6e0fec3937ddd597dc44adaec61a40701f Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 08:33:22 +0100
Subject: [PATCH 03/15] Support FP16 traits on CTK 12.0 (#3535)

* Support FP16 traits on CTK 12.0
* Only enable constexpr limits when supported
* Support float_eq on CTK < 12.2
---
 .../is_extended_floating_point.h              | 16 ++---
 libcudacxx/include/cuda/std/limits            | 58 ++++++++++++-------
 .../meta.unary.cat/is_floating_point.pass.cpp |  8 +--
 .../limits/is_specialized.pass.cpp            |  8 +--
 .../limits/numeric.limits.members/common.h    | 25 ++++++--
 .../const_data_members.pass.cpp               |  8 +--
 .../denorm_min.pass.cpp                       |  8 +--
 .../numeric.limits.members/digits.pass.cpp    |  8 +--
 .../numeric.limits.members/digits10.pass.cpp  |  8 +--
 .../numeric.limits.members/epsilon.pass.cpp   |  8 +--
 .../has_denorm.pass.cpp                       |  8 +--
 .../has_denorm_loss.pass.cpp                  |  8 +--
 .../has_infinity.pass.cpp                     |  8 +--
 .../has_quiet_NaN.pass.cpp                    |  8 +--
 .../has_signaling_NaN.pass.cpp                |  8 +--
 .../numeric.limits.members/infinity.pass.cpp  | 16 ++---
 .../is_bounded.pass.cpp                       |  8 +--
 .../numeric.limits.members/is_exact.pass.cpp  |  8 +--
 .../numeric.limits.members/is_iec559.pass.cpp |  8 +--
 .../is_integer.pass.cpp                       |  8 +--
 .../numeric.limits.members/is_modulo.pass.cpp |  8 +--
 .../numeric.limits.members/is_signed.pass.cpp |  8 +--
 .../numeric.limits.members/lowest.pass.cpp    |  8 +--
 .../numeric.limits.members/max.pass.cpp       |  8 +--
 .../max_digits10.pass.cpp                     |  8 +--
 .../max_exponent.pass.cpp                     |  8 +--
 .../max_exponent10.pass.cpp                   |  8 +--
 .../numeric.limits.members/min.pass.cpp       |  8 +--
 .../min_exponent.pass.cpp                     |  8 +--
 .../min_exponent10.pass.cpp                   |  8 +--
 .../numeric.limits.members/quiet_NaN.pass.cpp |  8 +--
 .../numeric.limits.members/radix.pass.cpp     |  8 +--
 .../round_error.pass.cpp                      |  8 +--
 .../round_style.pass.cpp                      |  8 +--
 .../signaling_NaN.pass.cpp                    |  8 +--
 .../tinyness_before.pass.cpp                  |  8 +--
 .../numeric.limits.members/traps.pass.cpp     |  8 +--
 37 files changed, 205 insertions(+), 174 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
index b9700a87066..040418f5fe7 100644
--- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
+++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h
@@ -22,16 +22,16 @@
 
 #include <cuda/std/__type_traits/integral_constant.h>
 
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
 #  include <cuda_fp16.h>
-#endif // _LIBCUDACXX_HAS_NVFP16
+#endif // _CCCL_HAS_NVFP16
 
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#if defined(_CCCL_HAS_NVBF16)
 _CCCL_DIAG_PUSH
 _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
 #  include <cuda_bf16.h>
 _CCCL_DIAG_POP
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 
 #if _CCCL_HAS_NVFP8()
 #  include <cuda_fp8.h>
@@ -53,7 +53,7 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v
 #  endif // !_CCCL_NO_INLINE_VARIABLES
 #endif // !_CCCL_NO_VARIABLE_TEMPLATES
 
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
 template <>
 struct __is_extended_floating_point<__half> : true_type
 {};
@@ -62,9 +62,9 @@ struct __is_extended_floating_point<__half> : true_type
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true;
 #  endif // !_CCCL_NO_INLINE_VARIABLES
-#endif // _LIBCUDACXX_HAS_NVFP16
+#endif // _CCCL_HAS_NVFP16
 
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#if defined(_CCCL_HAS_NVBF16)
 template <>
 struct __is_extended_floating_point<__nv_bfloat16> : true_type
 {};
@@ -73,7 +73,7 @@ struct __is_extended_floating_point<__nv_bfloat16> : true_type
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__nv_bfloat16> = true;
 #  endif // !_CCCL_NO_INLINE_VARIABLES
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 
 #if _CCCL_HAS_NVFP8()
 template <>
diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits
index 9d0cbc81108..29f4bf24ec3 100644
--- a/libcudacxx/include/cuda/std/limits
+++ b/libcudacxx/include/cuda/std/limits
@@ -608,7 +608,13 @@ public:
 #endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE
 };
 
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
+#  ifdef _LIBCUDACXX_HAS_NVFP16
+#    define _LIBCUDACXX_FP16_CONSTEXPR constexpr
+#  else //_LIBCUDACXX_HAS_NVFP16
+#    define _LIBCUDACXX_FP16_CONSTEXPR
+#  endif //_LIBCUDACXX_HAS_NVFP16
+
 template <>
 class __numeric_limits_impl<__half, __numeric_limits_type::__floating_point>
 {
@@ -621,15 +627,15 @@ public:
   static constexpr int digits       = 11;
   static constexpr int digits10     = 3;
   static constexpr int max_digits10 = 5;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type min() noexcept
   {
     return type(__half_raw{0x0400u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type max() noexcept
   {
     return type(__half_raw{0x7bffu});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type lowest() noexcept
   {
     return type(__half_raw{0xfbffu});
   }
@@ -637,11 +643,11 @@ public:
   static constexpr bool is_integer = false;
   static constexpr bool is_exact   = false;
   static constexpr int radix       = __FLT_RADIX__;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type epsilon() noexcept
   {
     return type(__half_raw{0x1400u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type round_error() noexcept
   {
     return type(__half_raw{0x3800u});
   }
@@ -656,19 +662,19 @@ public:
   static constexpr bool has_signaling_NaN        = true;
   static constexpr float_denorm_style has_denorm = denorm_present;
   static constexpr bool has_denorm_loss          = false;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type infinity() noexcept
   {
     return type(__half_raw{0x7c00u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type quiet_NaN() noexcept
   {
     return type(__half_raw{0x7e00u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type signaling_NaN() noexcept
   {
     return type(__half_raw{0x7d00u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_FP16_CONSTEXPR type denorm_min() noexcept
   {
     return type(__half_raw{0x0001u});
   }
@@ -681,9 +687,16 @@ public:
   static constexpr bool tinyness_before          = false;
   static constexpr float_round_style round_style = round_to_nearest;
 };
-#endif // _LIBCUDACXX_HAS_NVFP16
+#  undef _LIBCUDACXX_FP16_CONSTEXPR
+#endif // _CCCL_HAS_NVFP16
+
+#if defined(_CCCL_HAS_NVBF16)
+#  ifdef _LIBCUDACXX_HAS_NVBF16 // decides whether the __nv_bfloat16 limit functions below are constexpr
+#    define _LIBCUDACXX_BF16_CONSTEXPR constexpr
+#  else // ^^^ _LIBCUDACXX_HAS_NVBF16 ^^^ / vvv !_LIBCUDACXX_HAS_NVBF16 vvv
+#    define _LIBCUDACXX_BF16_CONSTEXPR
+#  endif // _LIBCUDACXX_HAS_NVBF16
 
-#if defined(_LIBCUDACXX_HAS_NVBF16)
 template <>
 class __numeric_limits_impl<__nv_bfloat16, __numeric_limits_type::__floating_point>
 {
@@ -696,15 +709,15 @@ public:
   static constexpr int digits       = 8;
   static constexpr int digits10     = 2;
   static constexpr int max_digits10 = 4;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type min() noexcept
   {
     return type(__nv_bfloat16_raw{0x0080u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type max() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type max() noexcept
   {
     return type(__nv_bfloat16_raw{0x7f7fu});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type lowest() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type lowest() noexcept
   {
     return type(__nv_bfloat16_raw{0xff7fu});
   }
@@ -712,11 +725,11 @@ public:
   static constexpr bool is_integer = false;
   static constexpr bool is_exact   = false;
   static constexpr int radix       = __FLT_RADIX__;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type epsilon() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type epsilon() noexcept
   {
     return type(__nv_bfloat16_raw{0x3c00u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type round_error() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type round_error() noexcept
   {
     return type(__nv_bfloat16_raw{0x3f00u});
   }
@@ -731,19 +744,19 @@ public:
   static constexpr bool has_signaling_NaN        = true;
   static constexpr float_denorm_style has_denorm = denorm_present;
   static constexpr bool has_denorm_loss          = false;
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type infinity() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type infinity() noexcept
   {
     return type(__nv_bfloat16_raw{0x7f80u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type quiet_NaN() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type quiet_NaN() noexcept
   {
     return type(__nv_bfloat16_raw{0x7fc0u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type signaling_NaN() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type signaling_NaN() noexcept
   {
     return type(__nv_bfloat16_raw{0x7fa0u});
   }
-  _LIBCUDACXX_HIDE_FROM_ABI static constexpr type denorm_min() noexcept
+  _LIBCUDACXX_HIDE_FROM_ABI static _LIBCUDACXX_BF16_CONSTEXPR type denorm_min() noexcept
   {
     return type(__nv_bfloat16_raw{0x0001u});
   }
@@ -756,7 +769,8 @@ public:
   static constexpr bool tinyness_before          = false;
   static constexpr float_round_style round_style = round_to_nearest;
 };
-#endif // _LIBCUDACXX_HAS_NVBF16
+#  undef _LIBCUDACXX_BF16_CONSTEXPR
+#endif // _CCCL_HAS_NVBF16
 
 #if _CCCL_HAS_NVFP8()
 #  if defined(_CCCL_BUILTIN_BIT_CAST) || _CCCL_STD_VER >= 2014
diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
index b0b7a3f3b69..5a04070c598 100644
--- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/meta.unary/meta.unary.cat/is_floating_point.pass.cpp
@@ -80,12 +80,12 @@ int main(int, char**)
   test_is_floating_point<float>();
   test_is_floating_point<double>();
   test_is_floating_point<long double>();
-#ifdef _LIBCUDACXX_HAS_NVFP16
+#ifdef _CCCL_HAS_NVFP16
   test_is_floating_point<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#ifdef _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVFP16
+#ifdef _CCCL_HAS_NVBF16
   test_is_floating_point<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test_is_floating_point<__nv_fp8_e4m3>();
   test_is_floating_point<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp
index 7113c0e2772..adb30091033 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/is_specialized.pass.cpp
@@ -68,12 +68,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 
   static_assert(!cuda::std::numeric_limits<cuda::std::complex<double>>::is_specialized,
                 "!cuda::std::numeric_limits<cuda::std::complex<double> >::is_specialized");
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h
index 8400071611c..7d15f2ba6b6 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/common.h
@@ -17,6 +17,7 @@
 #define __CUDA_NO_BFLOAT16_CONVERSIONS__ 1
 #define __CUDA_NO_BFLOAT16_OPERATORS__   1
 
+#include <cuda/std/__bit/bit_cast.h>
 #include <cuda/std/limits>
 
 template <class T>
@@ -42,27 +43,43 @@ __host__ __device__ inline __nv_fp8_e5m2 make_fp8_e5m2(double x, __nv_saturation
 
 __host__ __device__ inline bool float_eq(__nv_fp8_e4m3 x, __nv_fp8_e4m3 y)
 {
+#  if _CCCL_CUDACC_AT_LEAST(12, 2) // convert both operands to __half and compare those
   return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E4M3)});
+#  else // !_CCCL_CUDACC_AT_LEAST(12, 2): compare the raw bytes of the operands
+  return ::cuda::std::bit_cast<unsigned char>(x) == ::cuda::std::bit_cast<unsigned char>(y);
+#  endif // _CCCL_CUDACC_AT_LEAST(12, 2)
 }
 
 __host__ __device__ inline bool float_eq(__nv_fp8_e5m2 x, __nv_fp8_e5m2 y)
 {
+#  if _CCCL_CUDACC_AT_LEAST(12, 2) // convert both operands to __half and compare those
   return float_eq(__half{__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2)}, __half{__nv_cvt_fp8_to_halfraw(y.__x, __NV_E5M2)});
+#  else // !_CCCL_CUDACC_AT_LEAST(12, 2): compare the raw bytes of the operands
+  return ::cuda::std::bit_cast<unsigned char>(x) == ::cuda::std::bit_cast<unsigned char>(y);
+#  endif // _CCCL_CUDACC_AT_LEAST(12, 2)
 }
 #endif // _CCCL_HAS_NVFP8
 
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
 __host__ __device__ inline bool float_eq(__half x, __half y)
 {
+#  if _CCCL_CUDACC_AT_LEAST(12, 2) // compare directly with __heq
   return __heq(x, y);
+#  else // !_CCCL_CUDACC_AT_LEAST(12, 2): compare after converting both operands to float
+  return __half2float(x) == __half2float(y);
+#  endif // _CCCL_CUDACC_AT_LEAST(12, 2)
 }
-#endif // _LIBCUDACXX_HAS_NVFP16
+#endif // _CCCL_HAS_NVFP16
 
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#if defined(_CCCL_HAS_NVBF16)
 __host__ __device__ inline bool float_eq(__nv_bfloat16 x, __nv_bfloat16 y)
 {
+#  if _CCCL_CUDACC_AT_LEAST(12, 2) // compare directly with __heq
   return __heq(x, y);
+#  else // !_CCCL_CUDACC_AT_LEAST(12, 2): compare after converting both operands to float
+  return __bfloat162float(x) == __bfloat162float(y);
+#  endif // _CCCL_CUDACC_AT_LEAST(12, 2)
 }
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 
 #endif // NUMERIC_LIMITS_MEMBERS_COMMON_H
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
index b095d63afcd..093b5d331be 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
@@ -110,12 +110,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test_type<long double>();
 #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test_type<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test_type<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test_type<__nv_fp8_e4m3>();
   test_type<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp
index 475f41a3388..9ea232eaad6 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/denorm_min.pass.cpp
@@ -66,12 +66,12 @@ int main(int, char**)
   test<long double>(LDBL_TRUE_MIN);
 #  endif
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(5.9604644775390625e-08));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(9.18354961579912115600575419705e-41));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(0.001953125));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(0.0000152587890625));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp
index 0d3c910b672..01f6b05543b 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits.pass.cpp
@@ -55,12 +55,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, LDBL_MANT_DIG>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, 11>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, 8>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, 3>();
   test<__nv_fp8_e5m2, 2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp
index bd66aeecfeb..24c53725738 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/digits10.pass.cpp
@@ -74,12 +74,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>();
   test<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp
index 15366bdf308..bb65847df33 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/epsilon.pass.cpp
@@ -57,12 +57,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(LDBL_EPSILON);
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(0.0009765625));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(0.0078125));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(0.125));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(0.25));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
index 8fa506b93ce..8d9881580bf 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, cuda::std::denorm_present>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, cuda::std::denorm_present>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, cuda::std::denorm_present>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, cuda::std::denorm_present>();
   test<__nv_fp8_e5m2, cuda::std::denorm_present>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
index 3b7722acd8b..5a046a9b339 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp
index ebddcb4421e..768e53d1c88 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_infinity.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, true>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp
index 908f2d7fa4a..4c3e11a9b05 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_quiet_NaN.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, true>();
   test<__nv_fp8_e5m2, true>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp
index 62d81c8a524..1b80d1869e6 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/has_signaling_NaN.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, true>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp
index 627105a4a8c..8dd611556c5 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/infinity.pass.cpp
@@ -64,12 +64,12 @@ int main(int, char**)
 #  ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(1. / 0.);
 #  endif
-#  if defined(_LIBCUDACXX_HAS_NVFP16)
+#  if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(1.0 / 0.0));
-#  endif // _LIBCUDACXX_HAS_NVFP16
-#  if defined(_LIBCUDACXX_HAS_NVBF16)
+#  endif // _CCCL_HAS_NVFP16
+#  if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(1.0 / 0.0));
-#  endif // _LIBCUDACXX_HAS_NVBF16
+#  endif // _CCCL_HAS_NVBF16
 #  if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(__nv_fp8_e4m3{});
   test<__nv_fp8_e5m2>(make_fp8_e5m2(1.0 / 0.0));
@@ -81,12 +81,12 @@ int main(int, char**)
 #  ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(INFINITY);
 #  endif
-#  if defined(_LIBCUDACXX_HAS_NVFP16)
+#  if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(INFINITY));
-#  endif // _LIBCUDACXX_HAS_NVFP16
-#  if defined(_LIBCUDACXX_HAS_NVBF16)
+#  endif // _CCCL_HAS_NVFP16
+#  if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(INFINITY));
-#  endif // _LIBCUDACXX_HAS_NVBF16
+#  endif // _CCCL_HAS_NVBF16
 #  if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(__nv_fp8_e4m3{});
   test<__nv_fp8_e5m2>(make_fp8_e5m2(INFINITY));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp
index eeb9740e4e2..e28ab8313b6 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_bounded.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, true>();
   test<__nv_fp8_e5m2, true>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp
index c3c2e027c72..e6038f1589b 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_exact.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp
index 7bab40e8826..1ff809bad09 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_iec559.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp
index 68e7437f1e0..eed9d38c050 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_integer.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp
index 992be2b18b7..fc3ca9dbb4e 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_modulo.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp
index be7e4f235a7..54005f6c0b9 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/is_signed.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, true>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, true>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, true>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, true>();
   test<__nv_fp8_e5m2, true>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp
index 6a8b2a9c181..72190bd2ad7 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/lowest.pass.cpp
@@ -66,12 +66,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(-LDBL_MAX);
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(-65504.0));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(-3.3895313892515355e+38));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(-448.0));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(-57344.0));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp
index a1582e41b22..5039f773a2f 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max.pass.cpp
@@ -65,12 +65,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(LDBL_MAX);
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(65504.0));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(3.3895313892515355e+38));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(448.0));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(57344.0));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp
index d01a4aa099c..309279bc79c 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_digits10.pass.cpp
@@ -69,12 +69,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>();
   test<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp
index 3027e9f06f5..606e9c52b7f 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent.pass.cpp
@@ -62,12 +62,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, LDBL_MAX_EXP>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, 16>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, 128>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, 8>();
   test<__nv_fp8_e5m2, 15>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp
index 5924aee173d..61145deec86 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/max_exponent10.pass.cpp
@@ -62,12 +62,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, LDBL_MAX_10_EXP>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, 4>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, 38>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, 2>();
   test<__nv_fp8_e5m2, 4>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp
index 15f470909df..ccab08a38f5 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min.pass.cpp
@@ -66,12 +66,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(LDBL_MIN);
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(6.103515625e-05));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(1.17549435082228750796873653722e-38));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(0.015625));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(0.000061035));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp
index b63d653a7c3..c942a6288be 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent.pass.cpp
@@ -62,12 +62,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, LDBL_MIN_EXP>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, -13>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, -125>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, -6>();
   test<__nv_fp8_e5m2, -15>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp
index a6ff20e7fde..e9b6f29d25f 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/min_exponent10.pass.cpp
@@ -62,12 +62,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, LDBL_MIN_10_EXP>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, -4>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, -37>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, -2>();
   test<__nv_fp8_e5m2, -5>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp
index 2d6d9582f5c..a8b076fbeee 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/quiet_NaN.pass.cpp
@@ -108,12 +108,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>();
   test<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp
index 7e5c87927aa..dd15c391180 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/radix.pass.cpp
@@ -55,12 +55,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, FLT_RADIX>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, FLT_RADIX>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, FLT_RADIX>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, FLT_RADIX>();
   test<__nv_fp8_e5m2, FLT_RADIX>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp
index d4faf373a09..95ed80eb951 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_error.pass.cpp
@@ -57,12 +57,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>(0.5);
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>(__double2half(0.5));
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>(__double2bfloat16(0.5));
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>(make_fp8_e4m3(0.5));
   test<__nv_fp8_e5m2>(make_fp8_e5m2(0.5));
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp
index 8515581d650..1eb5c0b0f5a 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/round_style.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, cuda::std::round_to_nearest>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, cuda::std::round_to_nearest>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, cuda::std::round_to_nearest>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, cuda::std::round_to_nearest>();
   test<__nv_fp8_e5m2, cuda::std::round_to_nearest>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp
index 19ace1b3d2c..0ec70976b32 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/signaling_NaN.pass.cpp
@@ -108,12 +108,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3>();
   test<__nv_fp8_e5m2>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp
index 38dec8c872b..1da28874b06 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp
@@ -54,12 +54,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();
diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
index 55d7eb990db..4cb627a4b77 100644
--- a/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
@@ -60,12 +60,12 @@ int main(int, char**)
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
   test<long double, false>();
 #endif
-#if defined(_LIBCUDACXX_HAS_NVFP16)
+#if defined(_CCCL_HAS_NVFP16)
   test<__half, false>();
-#endif // _LIBCUDACXX_HAS_NVFP16
-#if defined(_LIBCUDACXX_HAS_NVBF16)
+#endif // _CCCL_HAS_NVFP16
+#if defined(_CCCL_HAS_NVBF16)
   test<__nv_bfloat16, false>();
-#endif // _LIBCUDACXX_HAS_NVBF16
+#endif // _CCCL_HAS_NVBF16
 #if _CCCL_HAS_NVFP8()
   test<__nv_fp8_e4m3, false>();
   test<__nv_fp8_e5m2, false>();

From b6209e841a72eb7def4ba2aace30eff8a9b539a4 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 30 Jan 2025 09:06:31 +0100
Subject: [PATCH 04/15] Suppress execution checks for vocabulary types (#3578)

* Suppress execution checks for `optional`
* Suppress execution checks for `expected`
* Suppress execution checks for `pair`
* Suppress execution checks for `variant`
---
 .../cuda/std/__expected/bad_expected_access.h |  21 +-
 .../include/cuda/std/__expected/expected.h    |  20 ++
 .../cuda/std/__expected/expected_base.h       |  18 ++
 .../include/cuda/std/__expected/unexpected.h  |   7 +
 .../include/cuda/std/__memory/construct_at.h  |   1 +
 libcudacxx/include/cuda/std/__utility/pair.h  |  19 +-
 .../cuda/std/detail/libcxx/include/optional   |  25 +++
 .../cuda/std/detail/libcxx/include/tuple      |   2 +
 .../cuda/std/detail/libcxx/include/variant    |  20 ++
 .../expected/device_only_types.pass.cpp       | 201 ++++++++++++++++++
 .../expected/host_only_types.pass.cpp         | 199 +++++++++++++++++
 .../optional/device_only_types.pass.cpp       | 136 ++++++++++++
 .../optional/host_only_types.pass.cpp         | 134 ++++++++++++
 .../tuple/device_only_types.pass.cpp          |  81 +++++++
 .../tuple/forward_as_tuple_interop.pass.cpp   |   0
 .../utilities/tuple/host_only_types.pass.cpp  |  90 ++++++++
 .../tuple/vector_types_get.pass.cpp           |   0
 .../vector_types_structured_bindings.pass.cpp |   0
 .../tuple/vector_types_tuple_element.pass.cpp |   0
 .../tuple/vector_types_tuple_size.pass.cpp    |   0
 .../unexpected/device_only_types.pass.cpp     |  82 +++++++
 .../unexpected/host_only_types.pass.cpp       |  85 ++++++++
 .../utility/pair/device_only_types.pass.cpp   |  93 ++++++++
 .../utility/pair/host_only_types.pass.cpp     |  93 ++++++++
 .../pair/interop}/pair.assign.pass.cpp        |   0
 .../utility/pair/interop}/pair.cons.pass.cpp  |   0
 .../utility/pair/interop}/pair.conv.pass.cpp  |   0
 .../variant/device_only_types.pass.cpp        | 120 +++++++++++
 .../variant/host_only_types.pass.cpp          | 129 +++++++++++
 libcudacxx/test/support/host_device_types.h   | 148 +++++++++++++
 30 files changed, 1714 insertions(+), 10 deletions(-)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp
 rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/forward_as_tuple_interop.pass.cpp (100%)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp
 rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_get.pass.cpp (100%)
 rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_structured_bindings.pass.cpp (100%)
 rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_element.pass.cpp (100%)
 rename libcudacxx/test/libcudacxx/cuda/{ => utilities}/tuple/vector_types_tuple_size.pass.cpp (100%)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp
 rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.assign.pass.cpp (100%)
 rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.cons.pass.cpp (100%)
 rename libcudacxx/test/libcudacxx/cuda/{pair_interop => utilities/utility/pair/interop}/pair.conv.pass.cpp (100%)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp
 create mode 100644 libcudacxx/test/support/host_device_types.h

diff --git a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h
index 5600402e429..0f10f546be6 100644
--- a/libcudacxx/include/cuda/std/__expected/bad_expected_access.h
+++ b/libcudacxx/include/cuda/std/__expected/bad_expected_access.h
@@ -51,14 +51,6 @@ class bad_expected_access;
 template <>
 class bad_expected_access<void> : public ::std::exception
 {
-protected:
-  _CCCL_HIDE_FROM_ABI bad_expected_access() noexcept                             = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&)            = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access(bad_expected_access&&)                 = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(const bad_expected_access&) = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(bad_expected_access&&)      = default;
-  ~bad_expected_access() noexcept override                                       = default;
-
 public:
   // The way this has been designed (by using a class template below) means that we'll already
   // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch
@@ -74,10 +66,21 @@ template <class _Err>
 class bad_expected_access : public bad_expected_access<void>
 {
 public:
-  explicit bad_expected_access(_Err __e)
+#      if _CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types
+  _CCCL_HOST_DEVICE
+#      endif // _CCCL_CUDA_COMPILER(CLANG)
+  _CCCL_HIDE_FROM_ABI explicit bad_expected_access(_Err __e)
       : __unex_(_CUDA_VSTD::move(__e))
   {}
 
+#      if _CCCL_CUDA_COMPILER(CLANG) // Clang needs this or it breaks with device only types
+  _CCCL_HOST_DEVICE
+#      endif // _CCCL_CUDA_COMPILER(CLANG)
+  _CCCL_HIDE_FROM_ABI ~bad_expected_access() noexcept
+  {
+    // Empty on purpose: __unex_ is destroyed implicitly; an explicit ~_Err() call would destroy it twice.
+  }
+
   _LIBCUDACXX_HIDE_FROM_ABI _Err& error() & noexcept
   {
     return __unex_;
diff --git a/libcudacxx/include/cuda/std/__expected/expected.h b/libcudacxx/include/cuda/std/__expected/expected.h
index cc5ddfc03f0..f618ff57c92 100644
--- a/libcudacxx/include/cuda/std/__expected/expected.h
+++ b/libcudacxx/include/cuda/std/__expected/expected.h
@@ -1070,6 +1070,7 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
   // [expected.object.eq], equality operators
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y)
   {
     if (__x.__has_val_ != __y.has_value())
@@ -1090,12 +1091,14 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y)
   {
     return !(__x == __y);
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2, class _E2)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2)))
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y)
@@ -1118,6 +1121,7 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2, class _E2)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_void, _T2)))
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected<_T2, _E2>& __y)
@@ -1126,6 +1130,7 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2)
   _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const _T2& __v)
@@ -1133,18 +1138,21 @@ class expected : private __expected_move_assign<_Tp, _Err>
     return __x.__has_val_ && static_cast<bool>(__x.__union_.__val_ == __v);
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2)
   _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const _T2& __v, const expected& __x)
   {
     return __x.__has_val_ && static_cast<bool>(__x.__union_.__val_ == __v);
   }
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2)
   _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const _T2& __v)
   {
     return !__x.__has_val_ || static_cast<bool>(__x.__union_.__val_ != __v);
   }
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T2)
   _CCCL_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const _T2& __v, const expected& __x)
@@ -1153,22 +1161,26 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e)
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __e.error());
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __e, const expected& __x)
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __e.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __e)
   {
     return __x.__has_val_ || static_cast<bool>(__x.__union_.__unex_ != __e.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected<_E2>& __e, const expected& __x)
   {
@@ -1906,6 +1918,7 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
   }
 
   // [expected.void.eq], equality operators
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept
   {
     if (__x.__has_val_ != __y.has_value())
@@ -1918,12 +1931,14 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
     }
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept
   {
     return !(__x == __y);
   }
 #  endif
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator==(const expected& __x, const expected<void, _E2>& __y) noexcept
@@ -1938,6 +1953,7 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
     }
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator!=(const expected& __x, const expected<void, _E2>& __y) noexcept
@@ -1946,22 +1962,26 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
   }
 #  endif
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) noexcept
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __y.error());
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __y, const expected& __x) noexcept
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __y.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __y) noexcept
   {
     return __x.__has_val_ || static_cast<bool>(__x.__union_.__unex_ != __y.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const unexpected<_E2>& __y, const expected& __x) noexcept
   {
diff --git a/libcudacxx/include/cuda/std/__expected/expected_base.h b/libcudacxx/include/cuda/std/__expected/expected_base.h
index 31de97e3f50..0de6cc29158 100644
--- a/libcudacxx/include/cuda/std/__expected/expected_base.h
+++ b/libcudacxx/include/cuda/std/__expected/expected_base.h
@@ -71,30 +71,35 @@ union __expected_union_t
   struct __empty_t
   {};
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Tp2 = _Tp)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2))
       : __val_()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Tp2 = _Tp)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2)))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept
       : __empty_()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... __args) noexcept(
     _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...))
       : __val_(_CUDA_VSTD::forward<_Args>(__args)...)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(unexpect_t, _Args&&... __args) noexcept(
     _CCCL_TRAIT(is_nothrow_constructible, _Err, _Args...))
       : __unex_(_CUDA_VSTD::forward<_Args>(__args)...)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fun, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(
     __expected_construct_from_invoke_tag,
@@ -104,6 +109,7 @@ union __expected_union_t
       : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...))
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fun, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(
     __expected_construct_from_invoke_tag,
@@ -128,18 +134,21 @@ union __expected_union_t<_Tp, _Err, true>
   struct __empty_t
   {};
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Tp2 = _Tp)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_default_constructible, _Tp2))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept(_CCCL_TRAIT(is_nothrow_default_constructible, _Tp2))
       : __val_()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Tp2 = _Tp)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_default_constructible, _Tp2)))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t() noexcept
       : __empty_()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(in_place_t, _Args&&... __args) noexcept(
     _CCCL_TRAIT(is_nothrow_constructible, _Tp, _Args...))
@@ -152,6 +161,7 @@ union __expected_union_t<_Tp, _Err, true>
       : __unex_(_CUDA_VSTD::forward<_Args>(__args)...)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fun, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(
     __expected_construct_from_invoke_tag,
@@ -161,6 +171,7 @@ union __expected_union_t<_Tp, _Err, true>
       : __val_(_CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...))
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fun, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __expected_union_t(
     __expected_construct_from_invoke_tag,
@@ -436,6 +447,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err>
 {
   _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, _Tp, _Err);
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T1, class _T2, class... _Args)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...))
   static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void
@@ -445,6 +457,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err>
     _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::forward<_Args>(__args)...);
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T1, class _T2, class... _Args)
   _CCCL_REQUIRES(
     (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND _CCCL_TRAIT(is_nothrow_move_constructible, _T1))
@@ -456,6 +469,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err>
     _LIBCUDACXX_CONSTRUCT_AT(__newval, _CUDA_VSTD::move(__tmp));
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _T1, class _T2, class... _Args)
   _CCCL_REQUIRES(
     (!_CCCL_TRAIT(is_nothrow_constructible, _T1, _Args...)) _CCCL_AND(!_CCCL_TRAIT(is_nothrow_move_constructible, _T1)))
@@ -475,6 +489,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err>
     __trans.__complete();
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Err2 = _Err)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_nothrow_move_constructible, _Err2))
   static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void
@@ -493,6 +508,7 @@ struct __expected_storage : __expected_destruct<_Tp, _Err>
     __with_err.__has_val_ = true;
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Err2 = _Err)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_nothrow_move_constructible, _Err2)))
   static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void
@@ -653,6 +669,7 @@ struct __expected_copy_assign<_Tp, _Err, __smf_availability::__available> : __ex
   _CCCL_HIDE_FROM_ABI __expected_copy_assign(const __expected_copy_assign&) = default;
   _CCCL_HIDE_FROM_ABI __expected_copy_assign(__expected_copy_assign&&)      = default;
 
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 __expected_copy_assign&
   operator=(const __expected_copy_assign& __other) noexcept(
     _CCCL_TRAIT(is_nothrow_copy_assignable, _Tp) && _CCCL_TRAIT(is_nothrow_copy_constructible, _Tp)
@@ -917,6 +934,7 @@ struct __expected_storage<void, _Err> : __expected_destruct<void, _Err>
 {
   _LIBCUDACXX_DELEGATE_CONSTRUCTORS(__expected_storage, __expected_destruct, void, _Err);
 
+  _CCCL_EXEC_CHECK_DISABLE
   static _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __swap_val_unex_impl(
     __expected_storage& __with_val,
     __expected_storage& __with_err) noexcept(_CCCL_TRAIT(is_nothrow_move_constructible, _Err))
diff --git a/libcudacxx/include/cuda/std/__expected/unexpected.h b/libcudacxx/include/cuda/std/__expected/unexpected.h
index 0f8f3784374..0da94402a85 100644
--- a/libcudacxx/include/cuda/std/__expected/unexpected.h
+++ b/libcudacxx/include/cuda/std/__expected/unexpected.h
@@ -73,6 +73,7 @@ class unexpected
   _CCCL_HIDE_FROM_ABI unexpected(const unexpected&) = default;
   _CCCL_HIDE_FROM_ABI unexpected(unexpected&&)      = default;
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Error = _Err)
   _CCCL_REQUIRES((!_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, unexpected)
                   && !_CCCL_TRAIT(is_same, remove_cvref_t<_Error>, in_place_t)
@@ -82,6 +83,7 @@ class unexpected
       : __unex_(_CUDA_VSTD::forward<_Error>(__error))
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class... _Args)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, _Args...))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected(in_place_t, _Args&&... __args) noexcept(
@@ -89,6 +91,7 @@ class unexpected
       : __unex_(_CUDA_VSTD::forward<_Args>(__args)...)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Up, class... _Args)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_constructible, _Err, initializer_list<_Up>&, _Args...))
   _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit unexpected(
@@ -123,6 +126,7 @@ class unexpected
   }
 
   // [expected.un.swap]
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(unexpected& __other) noexcept(_CCCL_TRAIT(is_nothrow_swappable, _Err))
   {
     static_assert(_CCCL_TRAIT(is_swappable, _Err), "E must be swappable");
@@ -130,6 +134,7 @@ class unexpected
     swap(__unex_, __other.__unex_);
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Err2 = _Err)
   _CCCL_REQUIRES(_CCCL_TRAIT(is_swappable, _Err2))
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr void
@@ -140,6 +145,7 @@ class unexpected
   }
 
   // [expected.un.eq]
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _UErr>
   _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator==(const unexpected& __lhs,
@@ -148,6 +154,7 @@ class unexpected
     return __lhs.error() == __rhs.error();
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _UErr>
   _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator!=(const unexpected& __lhs,
diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h
index bc231cd27d7..a78314c6479 100644
--- a/libcudacxx/include/cuda/std/__memory/construct_at.h
+++ b/libcudacxx/include/cuda/std/__memory/construct_at.h
@@ -50,6 +50,7 @@
 #  ifndef __cpp_lib_constexpr_dynamic_alloc
 namespace std
 {
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp,
           class... _Args,
           class = decltype(::new(_CUDA_VSTD::declval<void*>()) _Tp(_CUDA_VSTD::declval<_Args>()...))>
diff --git a/libcudacxx/include/cuda/std/__utility/pair.h b/libcudacxx/include/cuda/std/__utility/pair.h
index e725cf4b001..e8678f58767 100644
--- a/libcudacxx/include/cuda/std/__utility/pair.h
+++ b/libcudacxx/include/cuda/std/__utility/pair.h
@@ -124,6 +124,7 @@ struct __pair_base
   _T1 first;
   _T2 second;
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Constraints                                               = __pair_constraints<_T1, _T2>,
             enable_if_t<_Constraints::__explicit_default_constructible, int> = 0>
   _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept(
@@ -132,6 +133,7 @@ struct __pair_base
       , second()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Constraints                                               = __pair_constraints<_T1, _T2>,
             enable_if_t<_Constraints::__implicit_default_constructible, int> = 0>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept(
@@ -140,6 +142,7 @@ struct __pair_base
       , second()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _U1, class _U2>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept(
     _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2))
@@ -163,6 +166,7 @@ struct __pair_base<_T1, _T2, true>
   _T1 first;
   _T2 second;
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Constraints                                               = __pair_constraints<_T1, _T2>,
             enable_if_t<_Constraints::__explicit_default_constructible, int> = 0>
   _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __pair_base() noexcept(
@@ -171,6 +175,7 @@ struct __pair_base<_T1, _T2, true>
       , second()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Constraints                                               = __pair_constraints<_T1, _T2>,
             enable_if_t<_Constraints::__implicit_default_constructible, int> = 0>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base() noexcept(
@@ -179,10 +184,13 @@ struct __pair_base<_T1, _T2, true>
       , second()
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_HIDE_FROM_ABI constexpr __pair_base(const __pair_base&) = default;
-  _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&)      = default;
+  _CCCL_EXEC_CHECK_DISABLE
+  _CCCL_HIDE_FROM_ABI constexpr __pair_base(__pair_base&&) = default;
 
   // We need to ensure that a reference type, which would inhibit the implicit copy assignment still works
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=(
     conditional_t<_CCCL_TRAIT(is_copy_assignable, _T1) && _CCCL_TRAIT(is_copy_assignable, _T2), __pair_base, __nat> const&
       __p) noexcept(_CCCL_TRAIT(is_nothrow_copy_assignable, _T1) && _CCCL_TRAIT(is_nothrow_copy_assignable, _T2))
@@ -193,6 +201,7 @@ struct __pair_base<_T1, _T2, true>
   }
 
   // We need to ensure that a reference type, which would inhibit the implicit move assignment still works
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __pair_base& operator=(
     conditional_t<_CCCL_TRAIT(is_move_assignable, _T1) && _CCCL_TRAIT(is_move_assignable, _T2), __pair_base, __nat>&&
       __p) noexcept(_CCCL_TRAIT(is_nothrow_move_assignable, _T1) && _CCCL_TRAIT(is_nothrow_move_assignable, _T2))
@@ -202,6 +211,7 @@ struct __pair_base<_T1, _T2, true>
     return *this;
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _U1, class _U2>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __pair_base(_U1&& __t1, _U2&& __t2) noexcept(
     _CCCL_TRAIT(is_nothrow_constructible, _T1, _U1) && _CCCL_TRAIT(is_nothrow_constructible, _T2, _U2))
@@ -532,6 +542,7 @@ _CCCL_HOST_DEVICE pair(_T1, _T2) -> pair<_T1, _T2>;
 
 // [pairs.spec], specialized algorithms
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
@@ -540,6 +551,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const pair<_T1,
 
 #ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr common_comparison_category_t<__synth_three_way_result<_T1>,
                                                                  __synth_three_way_result<_T2>>
@@ -554,30 +566,35 @@ operator<=>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 
 #else // _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
   return !(__x == __y);
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
   return __x.first < __y.first || (!(__y.first < __x.first) && __x.second < __y.second);
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
   return __y < __x;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
   return !(__x < __y);
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _T1, class _T2>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<=(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y)
 {
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/optional b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
index 04f056c91d3..d61ce254f4d 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/optional
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/optional
@@ -296,12 +296,14 @@ struct __optional_destruct_base<_Tp, false>
       , __engaged_(false)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
       : __val_(_CUDA_VSTD::forward<_Args>(__args)...)
       , __engaged_(true)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fp, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base(
     __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args)
@@ -338,12 +340,14 @@ struct __optional_destruct_base<_Tp, true>
       , __engaged_(false)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
       : __val_(_CUDA_VSTD::forward<_Args>(__args)...)
       , __engaged_(true)
   {}
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Fp, class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr __optional_destruct_base(
     __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args)
@@ -389,6 +393,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp>
     return _CUDA_VSTD::move(this->__val_);
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void __construct(_Args&&... __args)
   {
@@ -410,6 +415,7 @@ struct __optional_storage_base : __optional_destruct_base<_Tp>
     }
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _That>
   _LIBCUDACXX_HIDE_FROM_ABI constexpr void __assign_from(_That&& __opt)
   {
@@ -811,6 +817,7 @@ public:
     return this->__get();
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_HIDE_FROM_ABI constexpr void swap(optional& __opt) noexcept(
     _CCCL_TRAIT(is_nothrow_move_constructible, value_type) && _CCCL_TRAIT(is_nothrow_swappable, value_type))
   {
@@ -1088,6 +1095,7 @@ _CCCL_HOST_DEVICE optional(_Tp) -> optional<_Tp>;
 #  endif // _CCCL_NO_DEDUCTION_GUIDES
 
 // Comparisons between optionals
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() == declval<const _Up&>()), bool),
@@ -1105,6 +1113,7 @@ operator==(const optional<_Tp>& __x, const optional<_Up>& __y)
   return *__x == *__y;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() != declval<const _Up&>()), bool),
@@ -1122,6 +1131,7 @@ operator!=(const optional<_Tp>& __x, const optional<_Up>& __y)
   return *__x != *__y;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() < declval<const _Up&>()), bool),
@@ -1139,6 +1149,7 @@ operator<(const optional<_Tp>& __x, const optional<_Up>& __y)
   return *__x < *__y;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() > declval<const _Up&>()), bool),
@@ -1156,6 +1167,7 @@ operator>(const optional<_Tp>& __x, const optional<_Up>& __y)
   return *__x > *__y;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() <= declval<const _Up&>()), bool),
@@ -1173,6 +1185,7 @@ operator<=(const optional<_Tp>& __x, const optional<_Up>& __y)
   return *__x <= *__y;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() >= declval<const _Up&>()), bool),
@@ -1264,6 +1277,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator>=(nullopt_t, const optional<_T
 }
 
 // Comparisons with T
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() == declval<const _Up&>()), bool),
@@ -1273,6 +1287,7 @@ operator==(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x == __v : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() == declval<const _Up&>()), bool),
@@ -1282,6 +1297,7 @@ operator==(const _Tp& __v, const optional<_Up>& __x)
   return static_cast<bool>(__x) ? __v == *__x : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() != declval<const _Up&>()), bool),
@@ -1291,6 +1307,7 @@ operator!=(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x != __v : true;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() != declval<const _Up&>()), bool),
@@ -1300,6 +1317,7 @@ operator!=(const _Tp& __v, const optional<_Up>& __x)
   return static_cast<bool>(__x) ? __v != *__x : true;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() < declval<const _Up&>()), bool),
@@ -1309,6 +1327,7 @@ operator<(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x < __v : true;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() < declval<const _Up&>()), bool),
@@ -1318,6 +1337,7 @@ operator<(const _Tp& __v, const optional<_Up>& __x)
   return static_cast<bool>(__x) ? __v < *__x : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() <= declval<const _Up&>()), bool),
@@ -1327,6 +1347,7 @@ operator<=(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x <= __v : true;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() <= declval<const _Up&>()), bool),
@@ -1336,6 +1357,7 @@ operator<=(const _Tp& __v, const optional<_Up>& __x)
   return static_cast<bool>(__x) ? __v <= *__x : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() > declval<const _Up&>()), bool),
@@ -1345,6 +1367,7 @@ operator>(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x > __v : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() > declval<const _Up&>()), bool),
@@ -1354,6 +1377,7 @@ operator>(const _Tp& __v, const optional<_Up>& __x)
   return static_cast<bool>(__x) ? __v > *__x : true;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() >= declval<const _Up&>()), bool),
@@ -1363,6 +1387,7 @@ operator>=(const optional<_Tp>& __x, const _Up& __v)
   return static_cast<bool>(__x) ? *__x >= __v : false;
 }
 
+_CCCL_EXEC_CHECK_DISABLE
 template <class _Tp, class _Up>
 _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<
   _CCCL_TRAIT(is_convertible, decltype(declval<const _Tp&>() >= declval<const _Up&>()), bool),
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
index aa2fdeaa368..6ff1039e61b 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
@@ -1124,6 +1124,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple<_Tp&&...> forward_as_tuple
 template <size_t _Ip>
 struct __tuple_equal
 {
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Tp, class _Up>
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y)
   {
@@ -1157,6 +1158,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const tuple<_Tp.
 template <size_t _Ip>
 struct __tuple_less
 {
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Tp, class _Up>
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator()(const _Tp& __x, const _Up& __y)
   {
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/variant b/libcudacxx/include/cuda/std/detail/libcxx/include/variant
index 0f6ec9d29fc..af1f7ba85ad 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/variant
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/variant
@@ -255,6 +255,7 @@ C++20
 #include <cuda/std/__utility/monostate.h>
 #include <cuda/std/__utility/move.h>
 #include <cuda/std/__utility/swap.h>
+#include <cuda/std/__utility/unreachable.h>
 #include <cuda/std/cstddef>
 #include <cuda/std/initializer_list>
 #include <cuda/std/tuple>
@@ -744,10 +745,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __alt
 {
   using __value_type = _Tp;
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
   _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args)
       : __value(_CUDA_VSTD::forward<_Args>(__args)...)
   {}
+  _CCCL_EXEC_CHECK_DISABLE
+  constexpr __alt(const __alt&) = default;
+  _CCCL_EXEC_CHECK_DISABLE
+  constexpr __alt(__alt&&) = default;
+  _CCCL_EXEC_CHECK_DISABLE
+  constexpr __alt& operator=(const __alt&) = default;
+  _CCCL_EXEC_CHECK_DISABLE
+  constexpr __alt& operator=(__alt&&) = default;
+
+  _CCCL_EXEC_CHECK_DISABLE
+  ~__alt() = default;
 
   __value_type __value;
 };
@@ -906,6 +919,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT __dtor<__traits<_Types...>, _Trait::_Availab
 {
   struct __visitor
   {
+    _CCCL_EXEC_CHECK_DISABLE
     template <class _Alt>
     _LIBCUDACXX_HIDE_FROM_ABI void operator()(_Alt& __alt) const noexcept
     {
@@ -1148,6 +1162,7 @@ public:
   }
 
 protected:
+  _CCCL_EXEC_CHECK_DISABLE
   template <
     size_t _Ip,
     class _Tp,
@@ -1166,6 +1181,7 @@ protected:
     }
   }
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <
     size_t _Ip,
     class _Tp,
@@ -1896,7 +1912,11 @@ private:
       return __op(_CUDA_VSTD::get<0>(__lhs), _CUDA_VSTD::get<0>(__rhs));
     }
     // We already checked that every variant has a value, so we should never reach this line
+#  if _CCCL_COMPILER(MSVC) // MSVC needs this to be wrapped in a function or it will error
+    _CUDA_VSTD::unreachable();
+#  else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv
     _CCCL_UNREACHABLE();
+#  endif // !_CCCL_COMPILER(MSVC)
   }
 };
 
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp
new file mode 100644
index 00000000000..ba972e02d3a
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/device_only_types.pass.cpp
@@ -0,0 +1,201 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// We cannot suppress execution checks in cuda::std::construct_at
+// XFAIL: c++20 && !nvrtc && nvcc && !msvc
+// UNSUPPORTED: clang-14
+
+#include <cuda/std/cassert>
+#include <cuda/std/expected>
+#include <cuda/std/initializer_list>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::expected with device_only_type as both value and error type
+{
+  using expected = cuda::std::expected<device_only_type, device_only_type>;
+  { // default construction: the result holds a value (not an error) that compares equal to 0
+    expected default_constructed{};
+    assert(default_constructed.has_value());
+    assert(*default_constructed == 0);
+  }
+
+  { // in_place construction with no further arguments: holds a value equal to 0
+    expected in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.has_value());
+    assert(*in_place_zero_initialization == 0);
+  }
+
+  { // in_place construction of the value from a single argument
+    expected in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.has_value());
+    assert(*in_place_initialization == 42);
+  }
+
+  { // in_place construction of the value from an initializer_list plus an extra argument
+    expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list<int>{}, 42};
+    assert(init_list_initialization.has_value());
+    assert(*init_list_initialization == 42);
+  }
+
+  { // unexpect construction with no further arguments: holds an error equal to 0
+    expected in_place_zero_initialization{cuda::std::unexpect};
+    assert(!in_place_zero_initialization.has_value());
+    assert(in_place_zero_initialization.error() == 0);
+  }
+
+  { // unexpect construction of the error from a single argument
+    expected in_place_initialization{cuda::std::unexpect, 42};
+    assert(!in_place_initialization.has_value());
+    assert(in_place_initialization.error() == 42);
+  }
+
+  { // unexpect construction of the error from an initializer_list plus an extra argument
+    expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list<int>{}, 42};
+    assert(!init_list_initialization.has_value());
+    assert(init_list_initialization.error() == 42);
+  }
+
+  { // construction directly from a value
+    expected value_initialization{42};
+    assert(value_initialization.has_value());
+    assert(*value_initialization == 42);
+  }
+
+  { // copy construction from a value-holding expected
+    expected input{42};
+    expected dest{input};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // move construction from a value-holding expected
+    expected input{42};
+    expected dest{cuda::std::move(input)};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, value to value
+    expected input{42};
+    expected dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, value to default-constructed (dest already holds a value; an expected is never empty)
+    expected input{42};
+    expected dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, default-constructed to value
+    expected input{};
+    expected dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 0);
+  }
+
+  { // assignment, default-constructed to default-constructed
+    expected input{};
+    expected dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 0);
+  }
+
+  { // assignment, error to value: dest switches from holding a value to holding an error
+    expected input{cuda::std::unexpect, 42};
+    expected dest{1337};
+    dest = input;
+    assert(!dest.has_value());
+    assert(dest.error() == 42);
+  }
+
+  { // assignment, value to error: dest switches from holding an error to holding a value
+    expected input{42};
+    expected dest{cuda::std::unexpect, 1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, error to error
+    expected input{cuda::std::unexpect, 42};
+    expected dest{cuda::std::unexpect, 1337};
+    dest = input;
+    assert(!dest.has_value());
+    assert(dest.error() == 42);
+  }
+
+  { // comparison of two expected objects that both hold a value
+    expected lhs{42};
+    expected rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // comparison of two expected objects that both hold an error
+    expected lhs{cuda::std::unexpect, 42};
+    expected rhs{cuda::std::unexpect, 1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // comparison of a value-holding expected with a plain device_only_type, in both operand orders
+    expected expect{42};
+    assert(expect == device_only_type{42});
+    assert(device_only_type{42} == expect);
+    assert(expect != device_only_type{1337});
+    assert(device_only_type{1337} != expect);
+  }
+
+  { // comparison of an error-holding expected with cuda::std::unexpected, in both operand orders
+    expected expect{cuda::std::unexpect, 42};
+    assert(expect == cuda::std::unexpected<device_only_type>{42});
+    assert(cuda::std::unexpected<device_only_type>{42} == expect);
+    assert(expect != cuda::std::unexpected<device_only_type>{1337});
+    assert(cuda::std::unexpected<device_only_type>{1337} != expect);
+  }
+
+  { // swap when both sides hold a value: member swap first, then the unqualified non-member swap
+    expected lhs{42};
+    expected rhs{1337};
+    lhs.swap(rhs);
+    assert(*lhs == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(*rhs == 1337);
+  }
+
+  { // swap between a value-holding and an error-holding expected; the second swap restores the original state
+    expected lhs{42};
+    expected rhs{cuda::std::unexpect, 1337};
+    lhs.swap(rhs);
+    assert(lhs.error() == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(rhs.error() == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither parameter is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // test() is __device__, so it is only called when targeting the device
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp
new file mode 100644
index 00000000000..282288b7be8
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/expected/host_only_types.pass.cpp
@@ -0,0 +1,199 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/expected>
+#include <cuda/std/initializer_list>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::expected with host_only_type as both value and error type
+{
+  using expected = cuda::std::expected<host_only_type, host_only_type>;
+  { // default construction: the result holds a value (not an error) that compares equal to 0
+    expected default_constructed{};
+    assert(default_constructed.has_value());
+    assert(*default_constructed == 0);
+  }
+
+  { // in_place construction with no further arguments: holds a value equal to 0
+    expected in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.has_value());
+    assert(*in_place_zero_initialization == 0);
+  }
+
+  { // in_place construction of the value from a single argument
+    expected in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.has_value());
+    assert(*in_place_initialization == 42);
+  }
+
+  { // in_place construction of the value from an initializer_list plus an extra argument
+    expected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list<int>{}, 42};
+    assert(init_list_initialization.has_value());
+    assert(*init_list_initialization == 42);
+  }
+
+  { // unexpect construction with no further arguments: holds an error equal to 0
+    expected in_place_zero_initialization{cuda::std::unexpect};
+    assert(!in_place_zero_initialization.has_value());
+    assert(in_place_zero_initialization.error() == 0);
+  }
+
+  { // unexpect construction of the error from a single argument
+    expected in_place_initialization{cuda::std::unexpect, 42};
+    assert(!in_place_initialization.has_value());
+    assert(in_place_initialization.error() == 42);
+  }
+
+  { // unexpect construction of the error from an initializer_list plus an extra argument
+    expected init_list_initialization{cuda::std::unexpect, cuda::std::initializer_list<int>{}, 42};
+    assert(!init_list_initialization.has_value());
+    assert(init_list_initialization.error() == 42);
+  }
+
+  { // construction directly from a value
+    expected value_initialization{42};
+    assert(value_initialization.has_value());
+    assert(*value_initialization == 42);
+  }
+
+  { // copy construction from a value-holding expected
+    expected input{42};
+    expected dest{input};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // move construction from a value-holding expected
+    expected input{42};
+    expected dest{cuda::std::move(input)};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, value to value
+    expected input{42};
+    expected dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, value to default-constructed (dest already holds a value; an expected is never empty)
+    expected input{42};
+    expected dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, default-constructed to value
+    expected input{};
+    expected dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 0);
+  }
+
+  { // assignment, default-constructed to default-constructed
+    expected input{};
+    expected dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 0);
+  }
+
+  { // assignment, error to value: dest switches from holding a value to holding an error
+    expected input{cuda::std::unexpect, 42};
+    expected dest{1337};
+    dest = input;
+    assert(!dest.has_value());
+    assert(dest.error() == 42);
+  }
+
+  { // assignment, value to error: dest switches from holding an error to holding a value
+    expected input{42};
+    expected dest{cuda::std::unexpect, 1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // assignment, error to error
+    expected input{cuda::std::unexpect, 42};
+    expected dest{cuda::std::unexpect, 1337};
+    dest = input;
+    assert(!dest.has_value());
+    assert(dest.error() == 42);
+  }
+
+  { // comparison of two expected objects that both hold a value
+    expected lhs{42};
+    expected rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // comparison of two expected objects that both hold an error
+    expected lhs{cuda::std::unexpect, 42};
+    expected rhs{cuda::std::unexpect, 1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // comparison of a value-holding expected with a plain host_only_type, in both operand orders
+    expected expect{42};
+    assert(expect == host_only_type{42});
+    assert(host_only_type{42} == expect);
+    assert(expect != host_only_type{1337});
+    assert(host_only_type{1337} != expect);
+  }
+
+  { // comparison of an error-holding expected with cuda::std::unexpected, in both operand orders
+    expected expect{cuda::std::unexpect, 42};
+    assert(expect == cuda::std::unexpected<host_only_type>{42});
+    assert(cuda::std::unexpected<host_only_type>{42} == expect);
+    assert(expect != cuda::std::unexpected<host_only_type>{1337});
+    assert(cuda::std::unexpected<host_only_type>{1337} != expect);
+  }
+
+  { // swap when both sides hold a value: member swap first, then the unqualified non-member swap
+    expected lhs{42};
+    expected rhs{1337};
+    lhs.swap(rhs);
+    assert(*lhs == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(*rhs == 1337);
+  }
+
+  { // swap between a value-holding and an error-holding expected; the second swap restores the original state
+    expected lhs{42};
+    expected rhs{cuda::std::unexpect, 1337};
+    lhs.swap(rhs);
+    assert(lhs.error() == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(rhs.error() == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither parameter is used
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // test() is a plain host function, so it is only called when targeting the host
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp
new file mode 100644
index 00000000000..766b6ae821c
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/device_only_types.pass.cpp
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// We cannot suppress execution checks in cuda::std::construct_at
+// XFAIL: c++20 && !nvrtc && nvcc && !msvc
+// UNSUPPORTED: clang-14
+
+#include <cuda/std/cassert>
+#include <cuda/std/optional>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::optional<device_only_type>; declared __device__
+{
+  using optional = cuda::std::optional<device_only_type>;
+  { // default construction: no value is held
+    optional default_constructed{};
+    assert(!default_constructed.has_value());
+  }
+
+  { // in_place construction with no arguments: holds a value that compares equal to 0
+    optional in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.has_value());
+    assert(*in_place_zero_initialization == 0);
+  }
+
+  { // in_place construction from 42
+    optional in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.has_value());
+    assert(*in_place_initialization == 42);
+  }
+
+  { // construction directly from the int literal 42
+    optional value_initialization{42};
+    assert(value_initialization.has_value());
+    assert(*value_initialization == 42);
+  }
+
+  { // copy construction: the copy holds the same value
+    optional input{42};
+    optional dest{input};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // move construction: the destination holds the source's value
+    optional input{42};
+    optional dest{cuda::std::move(input)};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: source with value -> destination with value (value is overwritten)
+    optional input{42};
+    optional dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: source with value -> empty destination (destination gains the value)
+    optional input{42};
+    optional dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: empty source -> destination with value (destination becomes empty)
+    optional input{};
+    optional dest{1337};
+    dest = input;
+    assert(!dest.has_value());
+  }
+
+  { // copy assignment: empty source -> empty destination (destination stays empty)
+    optional input{};
+    optional dest{};
+    dest = input;
+    assert(!dest.has_value());
+  }
+
+  { // all six comparison operators between two optionals holding 42 and 1337
+    optional lhs{42};
+    optional rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // mixed comparisons between an optional and a bare device_only_type, in both operand orders
+    optional opt{42};
+    assert(opt == device_only_type{42});
+    assert(device_only_type{42} == opt);
+    assert(opt != device_only_type{1337});
+    assert(device_only_type{1337} != opt);
+
+    assert(opt < device_only_type{1337});
+    assert(device_only_type{7} < opt);
+    assert(opt <= device_only_type{1337});
+    assert(device_only_type{7} <= opt);
+
+    assert(opt > device_only_type{7});
+    assert(device_only_type{1337} > opt);
+    assert(opt >= device_only_type{7});
+    assert(device_only_type{1337} >= opt);
+  }
+
+  { // swap: member swap exchanges the values, the unqualified swap call exchanges them back
+    optional lhs{42};
+    optional rhs{1337};
+    lhs.swap(rhs);
+    assert(*lhs == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(*rhs == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // call test() only when the target is device code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp
new file mode 100644
index 00000000000..3bf26d0fb2e
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/optional/host_only_types.pass.cpp
@@ -0,0 +1,134 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/optional>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::optional<host_only_type>; unannotated, i.e. a host function
+{
+  using optional = cuda::std::optional<host_only_type>;
+  { // default construction: no value is held
+    optional default_constructed{};
+    assert(!default_constructed.has_value());
+  }
+
+  { // in_place construction with no arguments: holds a value that compares equal to 0
+    optional in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.has_value());
+    assert(*in_place_zero_initialization == 0);
+  }
+
+  { // in_place construction from 42
+    optional in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.has_value());
+    assert(*in_place_initialization == 42);
+  }
+
+  { // construction directly from the int literal 42
+    optional value_initialization{42};
+    assert(value_initialization.has_value());
+    assert(*value_initialization == 42);
+  }
+
+  { // copy construction: the copy holds the same value
+    optional input{42};
+    optional dest{input};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // move construction: the destination holds the source's value
+    optional input{42};
+    optional dest{cuda::std::move(input)};
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: source with value -> destination with value (value is overwritten)
+    optional input{42};
+    optional dest{1337};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: source with value -> empty destination (destination gains the value)
+    optional input{42};
+    optional dest{};
+    dest = input;
+    assert(dest.has_value());
+    assert(*dest == 42);
+  }
+
+  { // copy assignment: empty source -> destination with value (destination becomes empty)
+    optional input{};
+    optional dest{1337};
+    dest = input;
+    assert(!dest.has_value());
+  }
+
+  { // copy assignment: empty source -> empty destination (destination stays empty)
+    optional input{};
+    optional dest{};
+    dest = input;
+    assert(!dest.has_value());
+  }
+
+  { // all six comparison operators between two optionals holding 42 and 1337
+    optional lhs{42};
+    optional rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // mixed comparisons between an optional and a bare host_only_type, in both operand orders
+    optional opt{42};
+    assert(opt == host_only_type{42});
+    assert(host_only_type{42} == opt);
+    assert(opt != host_only_type{1337});
+    assert(host_only_type{1337} != opt);
+
+    assert(opt < host_only_type{1337});
+    assert(host_only_type{7} < opt);
+    assert(opt <= host_only_type{1337});
+    assert(host_only_type{7} <= opt);
+
+    assert(opt > host_only_type{7});
+    assert(host_only_type{1337} > opt);
+    assert(opt >= host_only_type{7});
+    assert(host_only_type{1337} >= opt);
+  }
+
+  { // swap: member swap exchanges the values, the unqualified swap call exchanges them back
+    optional lhs{42};
+    optional rhs{1337};
+    lhs.swap(rhs);
+    assert(*lhs == 1337);
+    assert(*rhs == 42);
+
+    swap(lhs, rhs);
+    assert(*lhs == 42);
+    assert(*rhs == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // call test() only when the target is host code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp
new file mode 100644
index 00000000000..d8820409d10
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/device_only_types.pass.cpp
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/cassert>
+#include <cuda/std/tuple>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::tuple<device_only_type>; declared __device__
+{
+  using tuple = cuda::std::tuple<device_only_type>;
+  { // default construction: the element compares equal to 0
+    tuple default_constructed{};
+    assert(cuda::std::get<0>(default_constructed) == 0);
+  }
+
+  { // construction from a device_only_type temporary
+    tuple value_initialization{device_only_type{42}};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // construction directly from the int literal 42
+    tuple value_initialization{42};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // copy construction: the copy holds the same element value
+    tuple input{42};
+    tuple dest{input};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // move construction: the destination holds the source's element value
+    tuple input{42};
+    tuple dest{cuda::std::move(input)};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // copy assignment overwrites the destination's element (1337 -> 42)
+    tuple input{42};
+    tuple dest{1337};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // all six comparison operators between tuples holding 42 and 1337
+    tuple lhs{42};
+    tuple rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap: member swap exchanges the elements, the unqualified swap call exchanges them back
+    tuple lhs{42};
+    tuple rhs{1337};
+    lhs.swap(rhs);
+    assert(cuda::std::get<0>(lhs) == 1337);
+    assert(cuda::std::get<0>(rhs) == 42);
+
+    swap(lhs, rhs);
+    assert(cuda::std::get<0>(lhs) == 42);
+    assert(cuda::std::get<0>(rhs) == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // call test() only when the target is device code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/tuple/forward_as_tuple_interop.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/forward_as_tuple_interop.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp
new file mode 100644
index 00000000000..4942d051b1c
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/host_only_types.pass.cpp
@@ -0,0 +1,90 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/tuple>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::tuple<host_only_type>; unannotated, i.e. a host function
+{
+  using tuple = cuda::std::tuple<host_only_type>;
+  { // default construction: the element compares equal to 0
+    tuple default_constructed{};
+    assert(cuda::std::get<0>(default_constructed) == 0);
+  }
+
+  { // construction from a host_only_type temporary
+    tuple value_initialization{host_only_type{42}};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // construction directly from the int literal 42
+    tuple value_initialization{42};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // copy construction: the copy holds the same element value
+    tuple input{42};
+    tuple dest{input};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // move construction: the destination holds the source's element value
+    tuple input{42};
+    tuple dest{cuda::std::move(input)};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // copy assignment overwrites the destination's element (1337 -> 42)
+    tuple input{42};
+    tuple dest{1337};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // copy assignment into a default-constructed tuple (a tuple has no "empty" state)
+    tuple input{42};
+    tuple dest{};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // all six comparison operators between tuples holding 42 and 1337
+    tuple lhs{42};
+    tuple rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap: member swap exchanges the elements, the unqualified swap call exchanges them back
+    tuple lhs{42};
+    tuple rhs{1337};
+    lhs.swap(rhs);
+    assert(cuda::std::get<0>(lhs) == 1337);
+    assert(cuda::std::get<0>(rhs) == 42);
+
+    swap(lhs, rhs);
+    assert(cuda::std::get<0>(lhs) == 42);
+    assert(cuda::std::get<0>(rhs) == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // call test() only when the target is host code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_get.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_get.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_structured_bindings.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_structured_bindings.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_element.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_element.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/tuple/vector_types_tuple_size.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/tuple/vector_types_tuple_size.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp
new file mode 100644
index 00000000000..f36e86c2c3f
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/device_only_types.pass.cpp
@@ -0,0 +1,82 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/cassert>
+#include <cuda/std/expected>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::unexpected<device_only_type>; declared __device__
+{
+  using unexpected = cuda::std::unexpected<device_only_type>;
+  { // in_place construction with no arguments: error() compares equal to 0
+    unexpected in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.error() == 0);
+  }
+
+  { // in_place construction from 42
+    unexpected in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.error() == 42);
+  }
+
+  { // construction directly from the int literal 42
+    unexpected value_initialization{42};
+    assert(value_initialization.error() == 42);
+  }
+
+  { // in_place construction from an empty initializer_list<int> followed by 42
+    unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list<int>{}, 42};
+    assert(init_list_initialization.error() == 42);
+  }
+
+  { // copy construction: the copy holds the same error
+    unexpected input{42};
+    unexpected dest{input};
+    assert(dest.error() == 42);
+  }
+
+  { // move construction: the destination holds the source's error
+    unexpected input{42};
+    unexpected dest{cuda::std::move(input)};
+    assert(dest.error() == 42);
+  }
+
+  { // copy assignment overwrites the destination's error (1337 -> 42)
+    unexpected input{42};
+    unexpected dest{1337};
+    dest = input;
+    assert(dest.error() == 42);
+  }
+
+  { // equality and inequality between unexpected objects holding 42 and 1337
+    unexpected lhs{42};
+    unexpected rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // swap: member swap exchanges the errors, the unqualified swap call exchanges them back
+    unexpected lhs{42};
+    unexpected rhs{1337};
+    lhs.swap(rhs);
+    assert(lhs.error() == 1337);
+    assert(rhs.error() == 42);
+
+    swap(lhs, rhs);
+    assert(lhs.error() == 42);
+    assert(rhs.error() == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // call test() only when the target is device code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp
new file mode 100644
index 00000000000..ca12494418c
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/unexpected/host_only_types.pass.cpp
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/expected>
+#include <cuda/std/initializer_list>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::unexpected<host_only_type>; unannotated, i.e. a host function
+{
+  using unexpected = cuda::std::unexpected<host_only_type>;
+  { // in_place construction with no arguments: error() compares equal to 0
+    unexpected in_place_zero_initialization{cuda::std::in_place};
+    assert(in_place_zero_initialization.error() == 0);
+  }
+
+  { // in_place construction from 42
+    unexpected in_place_initialization{cuda::std::in_place, 42};
+    assert(in_place_initialization.error() == 42);
+  }
+
+  { // construction directly from the int literal 42
+    unexpected value_initialization{42};
+    assert(value_initialization.error() == 42);
+  }
+
+  { // in_place construction from an empty initializer_list<int> followed by 42
+    unexpected init_list_initialization{cuda::std::in_place, cuda::std::initializer_list<int>{}, 42};
+    assert(init_list_initialization.error() == 42);
+  }
+
+  { // copy construction: the copy holds the same error
+    unexpected input{42};
+    unexpected dest{input};
+    assert(dest.error() == 42);
+  }
+
+  { // move construction: the destination holds the source's error
+    unexpected input{42};
+    unexpected dest{cuda::std::move(input)};
+    assert(dest.error() == 42);
+  }
+
+  { // copy assignment overwrites the destination's error (1337 -> 42)
+    unexpected input{42};
+    unexpected dest{1337};
+    dest = input;
+    assert(dest.error() == 42);
+  }
+
+  { // equality and inequality between unexpected objects holding 42 and 1337
+    unexpected lhs{42};
+    unexpected rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+  }
+
+  { // swap: member swap exchanges the errors, the unqualified swap call exchanges them back
+    unexpected lhs{42};
+    unexpected rhs{1337};
+    lhs.swap(rhs);
+    assert(lhs.error() == 1337);
+    assert(rhs.error() == 42);
+
+    swap(lhs, rhs);
+    assert(lhs.error() == 42);
+    assert(rhs.error() == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // call test() only when the target is host code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp
new file mode 100644
index 00000000000..aebdd6e12ea
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/device_only_types.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/utility>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::pair<device_only_type, device_only_type>; declared __device__
+{
+  using pair = cuda::std::pair<device_only_type, device_only_type>;
+  { // default construction: both members compare equal to 0
+    pair default_constructed{};
+    assert(default_constructed.first == 0);
+    assert(default_constructed.second == 0);
+  }
+
+  { // construction from two device_only_type temporaries
+    pair value_initialization{device_only_type{42}, device_only_type{1337}};
+    assert(value_initialization.first == 42);
+    assert(value_initialization.second == 1337);
+  }
+
+  { // construction directly from the int literals 42 and 1337
+    pair value_initialization{42, 1337};
+    assert(value_initialization.first == 42);
+    assert(value_initialization.second == 1337);
+  }
+
+  { // copy construction: the copy holds the same members
+    pair input{42, 1337};
+    pair dest{input};
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // move construction: the destination holds the source's members
+    pair input{42, 1337};
+    pair dest{cuda::std::move(input)};
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // copy assignment overwrites both members of the destination
+    pair input{42, 1337};
+    pair dest{1337, 42};
+    dest = input;
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // all six comparison operators between {42, 1337} and {1337, 42}
+    pair lhs{42, 1337};
+    pair rhs{1337, 42};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap: member swap exchanges the pairs, the unqualified swap call exchanges them back
+    pair lhs{42, 1337};
+    pair rhs{1337, 42};
+    lhs.swap(rhs);
+    assert(lhs.first == 1337);
+    assert(lhs.second == 42);
+    assert(rhs.first == 42);
+    assert(rhs.second == 1337);
+
+    swap(lhs, rhs);
+    assert(lhs.first == 42);
+    assert(lhs.second == 1337);
+    assert(rhs.first == 1337);
+    assert(rhs.second == 42);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // call test() only when the target is device code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp
new file mode 100644
index 00000000000..cf1195f204d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/host_only_types.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/utility>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::pair<host_only_type, host_only_type>; unannotated, i.e. a host function
+{
+  using pair = cuda::std::pair<host_only_type, host_only_type>;
+  { // default construction: both members compare equal to 0
+    pair default_constructed{};
+    assert(default_constructed.first == 0);
+    assert(default_constructed.second == 0);
+  }
+
+  { // construction from two host_only_type temporaries
+    pair value_initialization{host_only_type{42}, host_only_type{1337}};
+    assert(value_initialization.first == 42);
+    assert(value_initialization.second == 1337);
+  }
+
+  { // construction directly from the int literals 42 and 1337
+    pair value_initialization{42, 1337};
+    assert(value_initialization.first == 42);
+    assert(value_initialization.second == 1337);
+  }
+
+  { // copy construction: the copy holds the same members
+    pair input{42, 1337};
+    pair dest{input};
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // move construction: the destination holds the source's members
+    pair input{42, 1337};
+    pair dest{cuda::std::move(input)};
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // copy assignment overwrites both members of the destination
+    pair input{42, 1337};
+    pair dest{1337, 42};
+    dest = input;
+    assert(dest.first == 42);
+    assert(dest.second == 1337);
+  }
+
+  { // all six comparison operators between {42, 1337} and {1337, 42}
+    pair lhs{42, 1337};
+    pair rhs{1337, 42};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap: member swap exchanges the pairs, the unqualified swap call exchanges them back
+    pair lhs{42, 1337};
+    pair rhs{1337, 42};
+    lhs.swap(rhs);
+    assert(lhs.first == 1337);
+    assert(lhs.second == 42);
+    assert(rhs.first == 42);
+    assert(rhs.second == 1337);
+
+    swap(lhs, rhs);
+    assert(lhs.first == 42);
+    assert(lhs.second == 1337);
+    assert(rhs.first == 1337);
+    assert(rhs.second == 42);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // call test() only when the target is host code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.assign.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.assign.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.cons.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.cons.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp
similarity index 100%
rename from libcudacxx/test/libcudacxx/cuda/pair_interop/pair.conv.pass.cpp
rename to libcudacxx/test/libcudacxx/cuda/utilities/utility/pair/interop/pair.conv.pass.cpp
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp
new file mode 100644
index 00000000000..38ee416a8fc
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/device_only_types.pass.cpp
@@ -0,0 +1,120 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/cassert>
+#include <cuda/std/variant>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+__device__ void test() // exercises cuda::std::variant<device_only_type>; declared __device__
+{
+  using variant = cuda::std::variant<device_only_type>;
+  { // default construction: alternative 0 compares equal to 0
+    variant default_constructed{};
+    assert(cuda::std::get<0>(default_constructed) == 0);
+  }
+
+  { // construction from a device_only_type temporary
+    variant value_initialization{device_only_type{42}};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // construction directly from the int literal 42
+    variant value_initialization{42};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // in_place_type_t construction: alternative selected by type
+    variant in_place_initialization{cuda::std::in_place_type_t<device_only_type>{}, 42};
+    assert(cuda::std::get<0>(in_place_initialization) == 42);
+  }
+
+  { // in_place_index_t construction: alternative selected by index
+    variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42};
+    assert(cuda::std::get<0>(in_place_initialization) == 42);
+  }
+
+  { // in_place_type_t construction from an empty initializer_list<int> followed by 42
+    variant init_list_initialization{
+      cuda::std::in_place_type_t<device_only_type>{}, cuda::std::initializer_list<int>{}, 42};
+    assert(cuda::std::get<0>(init_list_initialization) == 42);
+  }
+
+  { // in_place_index_t construction from an empty initializer_list<int> followed by 42
+    variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list<int>{}, 42};
+    assert(cuda::std::get<0>(init_list_initialization) == 42);
+  }
+
+  { // copy construction: the copy holds the same value
+    variant input{42};
+    variant dest{input};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // move construction: the destination holds the source's value
+    variant input{42};
+    variant dest{cuda::std::move(input)};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // copy assignment overwrites the destination's value (1337 -> 42)
+    variant input{42};
+    variant dest{1337};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // emplace, alternative selected by type
+    variant var{42};
+    var.emplace<device_only_type>(42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // emplace, alternative selected by index
+    variant var{42};
+    var.emplace<0>(42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // emplace by type with an empty initializer_list<int> followed by 42
+    variant var{42};
+    var.emplace<device_only_type>(cuda::std::initializer_list<int>{}, 42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // all six comparison operators between variants holding 42 and 1337
+    variant lhs{42};
+    variant rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap: member swap exchanges the values, the unqualified swap call exchanges them back
+    variant lhs{42};
+    variant rhs{1337};
+    lhs.swap(rhs);
+    assert(cuda::std::get<0>(lhs) == 1337);
+    assert(cuda::std::get<0>(rhs) == 42);
+
+    swap(lhs, rhs);
+    assert(cuda::std::get<0>(lhs) == 42);
+    assert(cuda::std::get<0>(rhs) == 1337);
+  }
+}
+
+int main(int arg, char** argv) // neither argument is used
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (test();)) // call test() only when the target is device code
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp
new file mode 100644
index 00000000000..5f12da6074b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/utilities/variant/host_only_types.pass.cpp
@@ -0,0 +1,129 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: nvrtc
+
+#include <cuda/std/cassert>
+#include <cuda/std/variant>
+
+#include "host_device_types.h"
+#include "test_macros.h"
+
+void test() // exercises cuda::std::variant<host_only_type>: construction, assignment, emplace, comparison, swap
+{
+  using variant = cuda::std::variant<host_only_type>;
+  { // default construction
+    variant default_constructed{};
+    assert(cuda::std::get<0>(default_constructed) == 0);
+  }
+
+  { // value initialization from a host_only_type temporary
+    variant value_initialization{host_only_type{42}};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // value initialization from an int (converted to host_only_type)
+    variant value_initialization{42};
+    assert(cuda::std::get<0>(value_initialization) == 42);
+  }
+
+  { // in_place_type_t initialization
+    variant in_place_initialization{cuda::std::in_place_type_t<host_only_type>{}, 42};
+    assert(cuda::std::get<0>(in_place_initialization) == 42);
+  }
+
+  { // in_place_index_t initialization
+    variant in_place_initialization{cuda::std::in_place_index_t<0>{}, 42};
+    assert(cuda::std::get<0>(in_place_initialization) == 42);
+  }
+
+  { // in_place_type_t initializer_list initialization
+    variant init_list_initialization{
+      cuda::std::in_place_type_t<host_only_type>{}, cuda::std::initializer_list<int>{}, 42};
+    assert(cuda::std::get<0>(init_list_initialization) == 42);
+  }
+
+  { // in_place_index_t initializer_list initialization
+    variant init_list_initialization{cuda::std::in_place_index_t<0>{}, cuda::std::initializer_list<int>{}, 42};
+    assert(cuda::std::get<0>(init_list_initialization) == 42);
+  }
+
+  { // copy construction
+    variant input{42};
+    variant dest{input};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // move construction
+    variant input{42};
+    variant dest{cuda::std::move(input)};
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // assignment, value to value
+    variant input{42};
+    variant dest{1337};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // assignment, value to default-constructed variant
+    variant input{42};
+    variant dest{};
+    dest = input;
+    assert(cuda::std::get<0>(dest) == 42);
+  }
+
+  { // emplace by type
+    variant var{42};
+    var.emplace<host_only_type>(42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // emplace by index
+    variant var{42};
+    var.emplace<0>(42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // emplace by type with an initializer_list argument
+    variant var{42};
+    var.emplace<host_only_type>(cuda::std::initializer_list<int>{}, 42);
+    assert(cuda::std::get<0>(var) == 42);
+  }
+
+  { // comparison with variant
+    variant lhs{42};
+    variant rhs{1337};
+    assert(!(lhs == rhs));
+    assert(lhs != rhs);
+    assert(lhs < rhs);
+    assert(lhs <= rhs);
+    assert(!(lhs > rhs));
+    assert(!(lhs >= rhs));
+  }
+
+  { // swap
+    variant lhs{42};
+    variant rhs{1337};
+    lhs.swap(rhs); // member swap
+    assert(cuda::std::get<0>(lhs) == 1337);
+    assert(cuda::std::get<0>(rhs) == 42);
+
+    swap(lhs, rhs); // unqualified non-member swap, swapping the values back
+    assert(cuda::std::get<0>(lhs) == 42);
+    assert(cuda::std::get<0>(rhs) == 1337);
+  }
+}
+
+int main(int arg, char** argv) // arg and argv are not used by the body
+{
+  NV_IF_TARGET(NV_IS_HOST, (test();)) // test() is invoked only for the NV_IS_HOST target
+  return 0;
+}
diff --git a/libcudacxx/test/support/host_device_types.h b/libcudacxx/test/support/host_device_types.h
new file mode 100644
index 00000000000..e8fa21b85b9
--- /dev/null
+++ b/libcudacxx/test/support/host_device_types.h
@@ -0,0 +1,148 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_SUPPORT_HOST_DEVICE_TYPES
+#define TEST_SUPPORT_HOST_DEVICE_TYPES
+
+#include <cuda/std/initializer_list>
+#include <cuda/std/utility>
+
+#if !_CCCL_COMPILER(NVRTC)
+struct host_only_type // test helper wrapping an int; no member carries a __device__ annotation
+{
+  int val_; // the wrapped value
+
+  host_only_type(const int val = 0) noexcept // default constructor and implicit conversion from int
+      : val_(val)
+  {}
+  host_only_type(cuda::std::initializer_list<int>, const int val) noexcept // the list argument is ignored
+      : val_(val)
+  {}
+
+  host_only_type(const host_only_type& other) noexcept
+      : val_(other.val_)
+  {}
+  host_only_type(host_only_type&& other) noexcept
+      : val_(cuda::std::exchange(other.val_, -1)) // takes other's value and stores -1 into other.val_
+  {}
+
+  host_only_type& operator=(const host_only_type& other) noexcept
+  {
+    val_ = other.val_;
+    return *this;
+  }
+
+  host_only_type& operator=(host_only_type&& other) noexcept
+
+  {
+    val_ = cuda::std::exchange(other.val_, -1); // stores -1 into other.val_, then assigns its previous value
+    return *this;
+  }
+
+  ~host_only_type() noexcept {} // user-provided, so the type is not trivially destructible
+
+  _CCCL_NODISCARD_FRIEND bool operator==(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  { // all six comparison operators compare val_ directly
+    return lhs.val_ == rhs.val_;
+  }
+  _CCCL_NODISCARD_FRIEND bool operator!=(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  {
+    return lhs.val_ != rhs.val_;
+  }
+  _CCCL_NODISCARD_FRIEND bool operator<(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  {
+    return lhs.val_ < rhs.val_;
+  }
+  _CCCL_NODISCARD_FRIEND bool operator<=(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  {
+    return lhs.val_ <= rhs.val_;
+  }
+  _CCCL_NODISCARD_FRIEND bool operator>(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  {
+    return lhs.val_ > rhs.val_;
+  }
+  _CCCL_NODISCARD_FRIEND bool operator>=(const host_only_type& lhs, const host_only_type& rhs) noexcept
+  {
+    return lhs.val_ >= rhs.val_;
+  }
+
+  void swap(host_only_type& other) noexcept // member swap: exchanges the wrapped values
+  {
+    cuda::std::swap(val_, other.val_);
+  }
+};
+#endif // !_CCCL_COMPILER(NVRTC)
+
+#if _CCCL_HAS_CUDA_COMPILER
+struct device_only_type // test helper wrapping an int; every member function is declared __device__
+{
+  int val_; // the wrapped value
+
+  __device__ device_only_type(const int val = 0) noexcept // default constructor and implicit conversion from int
+      : val_(val)
+  {}
+  __device__ device_only_type(cuda::std::initializer_list<int>, const int val) noexcept // the list argument is ignored
+      : val_(val)
+  {}
+
+  __device__ device_only_type(const device_only_type& other) noexcept
+      : val_(other.val_)
+  {}
+  __device__ device_only_type(device_only_type&& other) noexcept
+      : val_(cuda::std::exchange(other.val_, -1)) // takes other's value and stores -1 into other.val_
+  {}
+
+  __device__ device_only_type& operator=(const device_only_type& other) noexcept
+  {
+    val_ = other.val_;
+    return *this;
+  }
+
+  __device__ device_only_type& operator=(device_only_type&& other) noexcept
+
+  {
+    val_ = cuda::std::exchange(other.val_, -1); // stores -1 into other.val_, then assigns its previous value
+    return *this;
+  }
+
+  __device__ ~device_only_type() noexcept {} // user-provided, so the type is not trivially destructible
+
+  __device__ _CCCL_NODISCARD_FRIEND bool operator==(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  { // all six comparison operators compare val_ directly
+    return lhs.val_ == rhs.val_;
+  }
+  __device__ _CCCL_NODISCARD_FRIEND bool operator!=(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  {
+    return lhs.val_ != rhs.val_;
+  }
+  __device__ _CCCL_NODISCARD_FRIEND bool operator<(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  {
+    return lhs.val_ < rhs.val_;
+  }
+  __device__ _CCCL_NODISCARD_FRIEND bool operator<=(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  {
+    return lhs.val_ <= rhs.val_;
+  }
+  __device__ _CCCL_NODISCARD_FRIEND bool operator>(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  {
+    return lhs.val_ > rhs.val_;
+  }
+  __device__ _CCCL_NODISCARD_FRIEND bool operator>=(const device_only_type& lhs, const device_only_type& rhs) noexcept
+  {
+    return lhs.val_ >= rhs.val_;
+  }
+
+  __device__ void swap(device_only_type& other) noexcept // member swap: exchanges the wrapped values
+  {
+    cuda::std::swap(val_, other.val_);
+  }
+};
+#endif // _CCCL_HAS_CUDA_COMPILER
+
+#endif // TEST_SUPPORT_HOST_DEVICE_TYPES

From 8615f321e6305a1dbbd72b8050c47e4e6b27790f Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Thu, 30 Jan 2025 00:09:17 -0800
Subject: [PATCH 05/15] [nv/target] Add sm_120 macros. (#3550)

Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 libcudacxx/include/nv/detail/__target_macros | 21 ++++++++++++++++++++
 libcudacxx/include/nv/target                 |  9 +++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/nv/detail/__target_macros b/libcudacxx/include/nv/detail/__target_macros
index 85df652c7d4..2de10fc8ec4 100644
--- a/libcudacxx/include/nv/detail/__target_macros
+++ b/libcudacxx/include/nv/detail/__target_macros
@@ -35,6 +35,7 @@
 #define _NV_TARGET_ARCH_TO_SELECTOR_900  nv::target::sm_90
 #define _NV_TARGET_ARCH_TO_SELECTOR_1000 nv::target::sm_100
 #define _NV_TARGET_ARCH_TO_SELECTOR_1010 nv::target::sm_101
+#define _NV_TARGET_ARCH_TO_SELECTOR_1200 nv::target::sm_120
 
 #define _NV_TARGET_ARCH_TO_SM_350  35
 #define _NV_TARGET_ARCH_TO_SM_370  37
@@ -54,6 +55,7 @@
 #define _NV_TARGET_ARCH_TO_SM_900  90
 #define _NV_TARGET_ARCH_TO_SM_1000 100
 #define _NV_TARGET_ARCH_TO_SM_1010 101
+#define _NV_TARGET_ARCH_TO_SM_1200 120
 
 // Only enable when compiling for CUDA/stdpar
 #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA)
@@ -76,6 +78,7 @@
 #  define _NV_TARGET_VAL_SM_90  nv::target::sm_90
 #  define _NV_TARGET_VAL_SM_100 nv::target::sm_100
 #  define _NV_TARGET_VAL_SM_101 nv::target::sm_101
+#  define _NV_TARGET_VAL_SM_120 nv::target::sm_120
 
 #  define _NV_TARGET___NV_IS_HOST   nv::target::is_host
 #  define _NV_TARGET___NV_IS_DEVICE nv::target::is_device
@@ -112,6 +115,7 @@
 #  define _NV_TARGET_VAL_SM_90  900
 #  define _NV_TARGET_VAL_SM_100 1000
 #  define _NV_TARGET_VAL_SM_101 1010
+#  define _NV_TARGET_VAL_SM_120 1200
 
 #  if defined(__CUDA_ARCH__)
 #    define _NV_TARGET_VAL                __CUDA_ARCH__
@@ -160,6 +164,7 @@
 #  define _NV_TARGET_VAL_SM_90  900
 #  define _NV_TARGET_VAL_SM_100 1000
 #  define _NV_TARGET_VAL_SM_101 1010
+#  define _NV_TARGET_VAL_SM_120 1200
 
 #  define _NV_TARGET_VAL 0
 
@@ -191,6 +196,7 @@
 #define _NV_TARGET___NV_PROVIDES_SM_90  (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_90))
 #define _NV_TARGET___NV_PROVIDES_SM_100 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_100))
 #define _NV_TARGET___NV_PROVIDES_SM_101 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_101))
+#define _NV_TARGET___NV_PROVIDES_SM_120 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_120))
 
 #define _NV_TARGET___NV_IS_EXACTLY_SM_35  (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_35))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_37  (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_37))
@@ -210,6 +216,7 @@
 #define _NV_TARGET___NV_IS_EXACTLY_SM_90  (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_100 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_100))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_101 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_101))
+#define _NV_TARGET___NV_IS_EXACTLY_SM_120 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_120))
 
 #define NV_PROVIDES_SM_35  __NV_PROVIDES_SM_35
 #define NV_PROVIDES_SM_37  __NV_PROVIDES_SM_37
@@ -229,6 +236,7 @@
 #define NV_PROVIDES_SM_90  __NV_PROVIDES_SM_90
 #define NV_PROVIDES_SM_100 __NV_PROVIDES_SM_100
 #define NV_PROVIDES_SM_101 __NV_PROVIDES_SM_101
+#define NV_PROVIDES_SM_120 __NV_PROVIDES_SM_120
 
 #define NV_IS_EXACTLY_SM_35  __NV_IS_EXACTLY_SM_35
 #define NV_IS_EXACTLY_SM_37  __NV_IS_EXACTLY_SM_37
@@ -248,6 +256,7 @@
 #define NV_IS_EXACTLY_SM_90  __NV_IS_EXACTLY_SM_90
 #define NV_IS_EXACTLY_SM_100 __NV_IS_EXACTLY_SM_100
 #define NV_IS_EXACTLY_SM_101 __NV_IS_EXACTLY_SM_101
+#define NV_IS_EXACTLY_SM_120 __NV_IS_EXACTLY_SM_120
 
 // Disable SM_90a support on non-supporting compilers.
 // Will re-enable for nvcc below.
@@ -381,6 +390,12 @@
 #    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_101 0
 #  endif
 
+#  if (_NV_TARGET___NV_IS_EXACTLY_SM_120)
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 1
+#  else
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_120 0
+#  endif
+
 // Re-enable sm_90a support in nvcc.
 #  undef NV_HAS_FEATURE_SM_90a
 #  define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a
@@ -529,6 +544,12 @@
 #    define _NV_TARGET_BOOL___NV_PROVIDES_SM_101 0
 #  endif
 
+#  if (_NV_TARGET___NV_PROVIDES_SM_120)
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 1
+#  else
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_120 0
+#  endif
+
 #  define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond
 #  define _NV_ARCH_COND_CAT(cond)  _NV_EVAL(_NV_ARCH_COND_CAT1(cond))
 
diff --git a/libcudacxx/include/nv/target b/libcudacxx/include/nv/target
index d8617220c84..4b77011243f 100644
--- a/libcudacxx/include/nv/target
+++ b/libcudacxx/include/nv/target
@@ -68,9 +68,10 @@ constexpr base_int_t sm_89_bit  = 1 << 15;
 constexpr base_int_t sm_90_bit  = 1 << 16;
 constexpr base_int_t sm_100_bit = 1 << 17;
 constexpr base_int_t sm_101_bit = 1 << 18;
+constexpr base_int_t sm_120_bit = 1 << 19;
 constexpr base_int_t all_devices =
   sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit
-  | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit;
+  | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit | sm_101_bit | sm_120_bit;
 
 // Store a set of targets as a set of bits
 struct _NV_BITSET_ATTRIBUTE target_description
@@ -103,6 +104,7 @@ enum class sm_selector : base_int_t
   sm_90  = 90,
   sm_100 = 100,
   sm_101 = 101,
+  sm_120 = 120,
 };
 
 constexpr base_int_t toint(sm_selector a)
@@ -130,12 +132,14 @@ constexpr base_int_t bitexact(sm_selector a)
        : toint(a) == 90  ? sm_90_bit
        : toint(a) == 100 ? sm_100_bit
        : toint(a) == 101 ? sm_101_bit
+       : toint(a) == 120 ? sm_120_bit
                          : 0;
 }
 
 constexpr base_int_t bitrounddown(sm_selector a)
 {
-  return toint(a) >= 101 ? sm_101_bit
+  return toint(a) >= 120 ? sm_120_bit
+       : toint(a) >= 101 ? sm_101_bit
        : toint(a) >= 100 ? sm_100_bit
        : toint(a) >= 90  ? sm_90_bit
        : toint(a) >= 89  ? sm_89_bit
@@ -214,6 +218,7 @@ constexpr sm_selector sm_89  = sm_selector::sm_89;
 constexpr sm_selector sm_90  = sm_selector::sm_90;
 constexpr sm_selector sm_100 = sm_selector::sm_100;
 constexpr sm_selector sm_101 = sm_selector::sm_101;
+constexpr sm_selector sm_120 = sm_selector::sm_120;
 
 using detail::is_exactly;
 using detail::provides;

From 3e888d8fd7953d595af016eacd89af610fb624e6 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 09:10:00 +0100
Subject: [PATCH 06/15] PTX: Remove internal instructions (#3583)

* barrier.cluster.aligned: Remove
This is not supposed to be exposed in CCCL.

* elect.sync: Remove
Not ready for inclusion yet. This needs to handle the optional extra
output mask as well.

* mapa: Remove
This has compiler bugs. We should use intrinsics instead.

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 .../generated/barrier_cluster_aligned.rst     |  63 ---------
 .../ptx/instructions/generated/elect_sync.rst |  11 --
 .../ptx/instructions/generated/mapa.rst       |  14 --
 .../generated/barrier_cluster_aligned.h       | 130 ------------------
 .../__ptx/instructions/generated/elect_sync.h |  36 -----
 .../cuda/__ptx/instructions/generated/mapa.h  |  33 -----
 .../ptx/generated/barrier_cluster_aligned.h   |  61 --------
 .../cuda/ptx/generated/elect_sync.h           |  26 ----
 .../test/libcudacxx/cuda/ptx/generated/mapa.h |  27 ----
 9 files changed, 401 deletions(-)
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
 delete mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
 delete mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
 delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h

diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
deleted file mode 100644
index a24093ac7b6..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst
+++ /dev/null
@@ -1,63 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-barrier.cluster.arrive.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.wait.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_wait(
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.arrive.release.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .release }
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::sem_release_t,
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.arrive.relaxed.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .relaxed }
-   // .aligned   = { .aligned }
-   // Marked volatile
-   template <typename = void>
-   __device__ static inline void barrier_cluster_arrive(
-     cuda::ptx::sem_relaxed_t,
-     cuda::ptx::dot_aligned_t);
-
-barrier.cluster.wait.acquire.aligned
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
-   // .sem       = { .acquire }
-   // .aligned   = { .aligned }
-   // Marked volatile and as clobbering memory
-   template <typename = void>
-   __device__ static inline void barrier_cluster_wait(
-     cuda::ptx::sem_acquire_t,
-     cuda::ptx::dot_aligned_t);
diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
deleted file mode 100644
index bc909c54319..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-elect.sync
-^^^^^^^^^^
-.. code:: cuda
-
-   // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
-   template <typename = void>
-   __device__ static inline bool elect_sync(
-     const uint32_t& membermask);
diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst
deleted file mode 100644
index 4ffc70d85d9..00000000000
--- a/docs/libcudacxx/ptx/instructions/generated/mapa.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-..
-   This file was automatically generated. Do not edit.
-
-mapa.shared::cluster.u32
-^^^^^^^^^^^^^^^^^^^^^^^^
-.. code:: cuda
-
-   // mapa.space.u32  dest, addr, target_cta; // PTX ISA 78, SM_90
-   // .space     = { .shared::cluster }
-   template <typename Tp>
-   __device__ static inline Tp* mapa(
-     cuda::ptx::space_cluster_t,
-     const Tp* addr,
-     uint32_t target_cta);
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
deleted file mode 100644
index 80fe3796e69..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h
+++ /dev/null
@@ -1,130 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
-#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
-
-/*
-// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t)
-{
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_wait(
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t)
-{
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.wait.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .release }
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_release_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t)
-{
-// __sem == sem_release (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .relaxed }
-// .aligned   = { .aligned }
-// Marked volatile
-template <typename = void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_relaxed_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t)
-{
-// __sem == sem_relaxed (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
-// .sem       = { .acquire }
-// .aligned   = { .aligned }
-// Marked volatile and as clobbering memory
-template <typename = void>
-__device__ static inline void barrier_cluster_wait(
-  cuda::ptx::sem_acquire_t,
-  cuda::ptx::dot_aligned_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t)
-{
-// __sem == sem_acquire (due to parameter type constraint)
-// __aligned == aligned (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory");
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
deleted file mode 100644
index e8691178f14..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_
-#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_
-
-/*
-// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90
-template <typename = void>
-__device__ static inline bool elect_sync(
-  const uint32_t& membermask);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask)
-{
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  _CUDA_VSTD::uint32_t __is_elected;
-  asm volatile(
-    "{\n\t .reg .pred P_OUT; \n\t"
-    "elect.sync _|P_OUT, %1;\n\t"
-    "selp.b32 %0, 1, 0, P_OUT; \n"
-    "}"
-    : "=r"(__is_elected)
-    : "r"(__membermask)
-    :);
-  return static_cast<bool>(__is_elected);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_elect_sync_is_not_supported_before_SM_90__();
-  return false;
-#  endif
-}
-#endif // __cccl_ptx_isa >= 800
-
-#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
deleted file mode 100644
index f93c8a62157..00000000000
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-#ifndef _CUDA_PTX_GENERATED_MAPA_H_
-#define _CUDA_PTX_GENERATED_MAPA_H_
-
-/*
-// mapa.space.u32  dest, addr, target_cta; // PTX ISA 78, SM_90
-// .space     = { .shared::cluster }
-template <typename Tp>
-__device__ static inline Tp* mapa(
-  cuda::ptx::space_cluster_t,
-  const Tp* addr,
-  uint32_t target_cta);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__();
-template <typename _Tp>
-_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta)
-{
-// __space == space_cluster (due to parameter type constraint)
-#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
-  _CUDA_VSTD::uint32_t __dest;
-  asm("mapa.shared::cluster.u32  %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :);
-  return __from_ptr_dsmem<_Tp>(__dest);
-#  else
-  // Unsupported architectures will have a linker error with a semi-decent error message
-  __cuda_ptx_mapa_is_not_supported_before_SM_90__();
-  return __from_ptr_dsmem<_Tp>(0);
-#  endif
-}
-#endif // __cccl_ptx_isa >= 780
-
-#endif // _CUDA_PTX_GENERATED_MAPA_H_
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
deleted file mode 100644
index 6f5a022dbc8..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_barrier_cluster_aligned(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                   // barrier.cluster.arrive.aligned;
-                   * fn_ptr++ = reinterpret_cast<void*>(
-                     static_cast<void (*)(cuda::ptx::dot_aligned_t)>(cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 780
-
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                   // barrier.cluster.wait.aligned;
-                   * fn_ptr++ = reinterpret_cast<void*>(
-                     static_cast<void (*)(cuda::ptx::dot_aligned_t)>(cuda::ptx::barrier_cluster_wait));));
-#endif // __cccl_ptx_isa >= 780
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.arrive.release.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 800
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.arrive.relaxed.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_arrive));));
-#endif // __cccl_ptx_isa >= 800
-
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // barrier.cluster.wait.acquire.aligned;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t, cuda::ptx::dot_aligned_t)>(
-          cuda::ptx::barrier_cluster_wait));));
-#endif // __cccl_ptx_isa >= 800
-}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
deleted file mode 100644
index 298225881d1..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_elect_sync(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // elect.sync _|is_elected, membermask;
-        * fn_ptr++ = reinterpret_cast<void*>(static_cast<bool (*)(const uint32_t&)>(cuda::ptx::elect_sync));));
-#endif // __cccl_ptx_isa >= 800
-}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h
deleted file mode 100644
index 9160be1fe2d..00000000000
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// This file was automatically generated. Do not edit.
-
-// We use a special strategy to force the generation of the PTX. This is mainly
-// a fight against dead-code-elimination in the NVVM layer.
-//
-// The reason we need this strategy is because certain older versions of ptxas
-// segfault when a non-sensical sequence of PTX is generated. So instead, we try
-// to force the instantiation and compilation to PTX of all the overloads of the
-// PTX wrapping functions.
-//
-// We do this by writing a function pointer of each overload to the kernel
-// parameter `fn_ptr`.
-//
-// Because `fn_ptr` is possibly visible outside this translation unit, the
-// compiler must compile all the functions which are stored.
-
-__global__ void test_mapa(void** fn_ptr)
-{
-#if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-        // mapa.shared::cluster.u32  dest, addr, target_cta;
-        * fn_ptr++ = reinterpret_cast<void*>(
-          static_cast<uint64_t* (*) (cuda::ptx::space_cluster_t, const uint64_t*, uint32_t)>(cuda::ptx::mapa));));
-#endif // __cccl_ptx_isa >= 780
-}

From 15a011658172b1b63bfac8a96fb49fec6d6af92a Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Thu, 30 Jan 2025 03:03:11 -0600
Subject: [PATCH 07/15] Add dynamic CUB dispatch for merge_sort (#3525)

* Add `dependent_launch` parameter to `TripleChevronFactory`
* Add `ItemsPerTile()` method to `PolicyWrapper`
* Add `MergeSortPolicyWrapper`
* Add `KernelSource` and use `launcher_factory` to launch `merge_sort` kernels
* Move the vsmem helper into the kernel source and read `BlockThreads` from it instead of from the policy directly
* Make `BlockThreads` templated on the policy type
* Obtain `ItemsPerTile` from the kernel source through vsmem helper
* Change vsmem indirection so that it is its own template parameter passed to `DispatchMergeSort`
* Use `_CCCL_HOST_DEVICE` for RTC
---
 cub/cub/detail/launcher/cuda_runtime.cuh      |   6 +-
 .../device/dispatch/dispatch_merge_sort.cuh   | 214 +++++++++++-------
 .../device/dispatch/kernels/merge_sort.cuh    |  22 ++
 .../dispatch/tuning/tuning_merge_sort.cuh     |  32 ++-
 cub/cub/util_device.cuh                       |   5 +
 5 files changed, 189 insertions(+), 90 deletions(-)

diff --git a/cub/cub/detail/launcher/cuda_runtime.cuh b/cub/cub/detail/launcher/cuda_runtime.cuh
index 81ef450f424..f59c26d7fbb 100644
--- a/cub/cub/detail/launcher/cuda_runtime.cuh
+++ b/cub/cub/detail/launcher/cuda_runtime.cuh
@@ -21,10 +21,10 @@ namespace detail
 
 struct TripleChevronFactory
 {
-  CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron
-  operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const
+  CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron operator()(
+    dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream, bool dependent_launch = false) const
   {
-    return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream);
+    return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream, dependent_launch);
   }
 
   CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version)
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 056522e162d..98a4b40e8f8 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -46,7 +46,6 @@
 #include <cub/util_vsmem.cuh>
 
 #include <thrust/detail/integer_math.h>
-#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
@@ -54,24 +53,89 @@
 
 CUB_NAMESPACE_BEGIN
 
-/*******************************************************************************
- * Policy
- ******************************************************************************/
-
-template <typename KeyInputIteratorT,
+namespace detail::merge_sort
+{
+template <typename MaxPolicyT,
+          typename KeyInputIteratorT,
           typename ValueInputIteratorT,
           typename KeyIteratorT,
           typename ValueIteratorT,
           typename OffsetT,
-          typename CompareOpT,
-          typename PolicyHub = detail::merge_sort::policy_hub<KeyIteratorT>>
+          typename CompareOpT>
+struct DeviceMergeSortKernelSource
+{
+  using KeyT   = cub::detail::value_t<KeyIteratorT>;
+  using ValueT = cub::detail::value_t<ValueIteratorT>;
+
+  CUB_DEFINE_KERNEL_GETTER(
+    MergeSortBlockSortKernel,
+    DeviceMergeSortBlockSortKernel<
+      MaxPolicyT,
+      KeyInputIteratorT,
+      ValueInputIteratorT,
+      KeyIteratorT,
+      ValueIteratorT,
+      OffsetT,
+      CompareOpT,
+      KeyT,
+      ValueT>);
+
+  CUB_DEFINE_KERNEL_GETTER(MergeSortPartitionKernel,
+                           DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>);
+
+  CUB_DEFINE_KERNEL_GETTER(
+    MergeSortMergeKernel,
+    DeviceMergeSortMergeKernel<MaxPolicyT,
+                               KeyInputIteratorT,
+                               ValueInputIteratorT,
+                               KeyIteratorT,
+                               ValueIteratorT,
+                               OffsetT,
+                               CompareOpT,
+                               KeyT,
+                               ValueT>);
+};
+
+} // namespace detail::merge_sort
+
+/*******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+  typename KeyInputIteratorT,
+  typename ValueInputIteratorT,
+  typename KeyIteratorT,
+  typename ValueIteratorT,
+  typename OffsetT,
+  typename CompareOpT,
+  typename PolicyHub    = detail::merge_sort::policy_hub<KeyIteratorT>,
+  typename KernelSource = detail::merge_sort::DeviceMergeSortKernelSource<
+    typename PolicyHub::MaxPolicy,
+    KeyInputIteratorT,
+    ValueInputIteratorT,
+    KeyIteratorT,
+    ValueIteratorT,
+    OffsetT,
+    CompareOpT>,
+  typename KernelLauncherFactory = detail::TripleChevronFactory,
+  typename VSMemHelperPolicyT    = detail::merge_sort::merge_sort_vsmem_helper_t<
+       typename PolicyHub::MaxPolicy::MergeSortPolicy,
+       KeyInputIteratorT,
+       ValueInputIteratorT,
+       KeyIteratorT,
+       ValueIteratorT,
+       OffsetT,
+       CompareOpT,
+       cub::detail::value_t<KeyIteratorT>,
+       cub::detail::value_t<ValueIteratorT>>>
 struct DispatchMergeSort
 {
   using KeyT   = cub::detail::value_t<KeyIteratorT>;
   using ValueT = cub::detail::value_t<ValueIteratorT>;
 
   /// Whether or not there are values to be trucked along with keys
-  static constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+  static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
 
   // Problem state
 
@@ -106,6 +170,12 @@ struct DispatchMergeSort
 
   int ptx_version;
 
+  KernelSource kernel_source;
+
+  KernelLauncherFactory launcher_factory;
+
+  VSMemHelperPolicyT vsmem_helper;
+
   // Constructor
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort(
     void* d_temp_storage,
@@ -117,7 +187,10 @@ struct DispatchMergeSort
     OffsetT num_items,
     CompareOpT compare_op,
     cudaStream_t stream,
-    int ptx_version)
+    int ptx_version,
+    KernelSource kernel_source             = {},
+    KernelLauncherFactory launcher_factory = {},
+    VSMemHelperPolicyT vsmem_helper        = {})
       : d_temp_storage(d_temp_storage)
       , temp_storage_bytes(temp_storage_bytes)
       , d_input_keys(d_input_keys)
@@ -128,28 +201,15 @@ struct DispatchMergeSort
       , compare_op(compare_op)
       , stream(stream)
       , ptx_version(ptx_version)
+      , kernel_source(kernel_source)
+      , launcher_factory(launcher_factory)
+      , vsmem_helper(vsmem_helper)
   {}
 
   // Invocation
   template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {})
   {
-    using MergePolicyT = typename ActivePolicyT::MergeSortPolicy;
-
-    using merge_sort_helper_t = detail::merge_sort::merge_sort_vsmem_helper_t<
-      MergePolicyT,
-      KeyInputIteratorT,
-      ValueInputIteratorT,
-      KeyIteratorT,
-      ValueIteratorT,
-      OffsetT,
-      CompareOpT,
-      KeyT,
-      ValueT>;
-
-    using BlockSortVSmemHelperT  = detail::vsmem_helper_impl<typename merge_sort_helper_t::block_sort_agent_t>;
-    using MergeAgentVSmemHelperT = detail::vsmem_helper_impl<typename merge_sort_helper_t::merge_agent_t>;
-
     cudaError error = cudaSuccess;
 
     if (num_items == 0)
@@ -163,8 +223,9 @@ struct DispatchMergeSort
 
     do
     {
-      constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE;
-      const auto num_tiles     = ::cuda::ceil_div(num_items, tile_size);
+      auto wrapped_policy  = detail::merge_sort::MakeMergeSortPolicyWrapper(policy);
+      const auto tile_size = vsmem_helper.ItemsPerTile(wrapped_policy.MergeSort());
+      const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
       const auto merge_partitions_size         = static_cast<std::size_t>(1 + num_tiles) * sizeof(OffsetT);
       const auto temporary_keys_storage_size   = static_cast<std::size_t>(num_items * sizeof(KeyT));
@@ -174,8 +235,8 @@ struct DispatchMergeSort
        * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases,
        * merge sort allocates virtual shared memory that resides in global memory.
        */
-      const std::size_t block_sort_smem_size       = num_tiles * BlockSortVSmemHelperT::vsmem_per_block;
-      const std::size_t merge_smem_size            = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block;
+      const std::size_t block_sort_smem_size       = num_tiles * vsmem_helper.block_sort_vsmem_per_block();
+      const std::size_t merge_smem_size            = num_tiles * vsmem_helper.merge_vsmem_per_block();
       const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);
 
       void* allocations[4]            = {nullptr, nullptr, nullptr, nullptr};
@@ -214,29 +275,19 @@ struct DispatchMergeSort
       auto items_buffer     = static_cast<ValueT*>(allocations[2]);
 
       // Invoke DeviceMergeSortBlockSortKernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-        static_cast<int>(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true)
-        .doit(
-          detail::merge_sort::DeviceMergeSortBlockSortKernel<
-            typename PolicyHub::MaxPolicy,
-            KeyInputIteratorT,
-            ValueInputIteratorT,
-            KeyIteratorT,
-            ValueIteratorT,
-            OffsetT,
-            CompareOpT,
-            KeyT,
-            ValueT>,
-          ping,
-          d_input_keys,
-          d_input_items,
-          d_output_keys,
-          d_output_items,
-          num_items,
-          keys_buffer,
-          items_buffer,
-          compare_op,
-          cub::detail::vsmem_t{allocations[3]});
+      launcher_factory(
+        static_cast<int>(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true)
+        .doit(kernel_source.MergeSortBlockSortKernel(),
+              ping,
+              d_input_keys,
+              d_input_items,
+              d_output_keys,
+              d_output_items,
+              num_items,
+              keys_buffer,
+              items_buffer,
+              compare_op,
+              cub::detail::vsmem_t{allocations[3]});
 
       error = CubDebug(detail::DebugSyncStream(stream));
       if (cudaSuccess != error)
@@ -273,9 +324,8 @@ struct DispatchMergeSort
         const OffsetT target_merged_tiles_number = OffsetT(2) << pass;
 
         // Partition
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-          partition_grid_size, threads_per_partition_block, 0, stream, true)
-          .doit(detail::merge_sort::DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>,
+        launcher_factory(partition_grid_size, threads_per_partition_block, 0, stream, true)
+          .doit(kernel_source.MergeSortPartitionKernel(),
                 ping,
                 d_output_keys,
                 keys_buffer,
@@ -300,29 +350,19 @@ struct DispatchMergeSort
         }
 
         // Merge
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-          static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true)
-          .doit(
-            detail::merge_sort::DeviceMergeSortMergeKernel<
-              typename PolicyHub::MaxPolicy,
-              KeyInputIteratorT,
-              ValueInputIteratorT,
-              KeyIteratorT,
-              ValueIteratorT,
-              OffsetT,
-              CompareOpT,
-              KeyT,
-              ValueT>,
-            ping,
-            d_output_keys,
-            d_output_items,
-            num_items,
-            keys_buffer,
-            items_buffer,
-            compare_op,
-            merge_partitions,
-            target_merged_tiles_number,
-            cub::detail::vsmem_t{allocations[3]});
+        launcher_factory(
+          static_cast<int>(num_tiles), vsmem_helper.BlockThreads(wrapped_policy.MergeSort()), 0, stream, true)
+          .doit(kernel_source.MergeSortMergeKernel(),
+                ping,
+                d_output_keys,
+                d_output_items,
+                num_items,
+                keys_buffer,
+                items_buffer,
+                compare_op,
+                merge_partitions,
+                target_merged_tiles_number,
+                cub::detail::vsmem_t{allocations[3]});
 
         error = CubDebug(detail::DebugSyncStream(stream));
         if (cudaSuccess != error)
@@ -342,6 +382,7 @@ struct DispatchMergeSort
     return error;
   }
 
+  template <typename MaxPolicyT = typename PolicyHub::MaxPolicy>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
     void* d_temp_storage,
     std::size_t& temp_storage_bytes,
@@ -351,7 +392,11 @@ struct DispatchMergeSort
     ValueIteratorT d_output_items,
     OffsetT num_items,
     CompareOpT compare_op,
-    cudaStream_t stream)
+    cudaStream_t stream,
+    KernelSource kernel_source             = {},
+    KernelLauncherFactory launcher_factory = {},
+    MaxPolicyT max_policy                  = {},
+    VSMemHelperPolicyT vsmem_helper        = {})
   {
     cudaError error = cudaSuccess;
     do
@@ -375,10 +420,13 @@ struct DispatchMergeSort
         num_items,
         compare_op,
         stream,
-        ptx_version);
+        ptx_version,
+        kernel_source,
+        launcher_factory,
+        vsmem_helper);
 
       // Dispatch to chained policy
-      error = CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch));
+      error = CubDebug(max_policy.Invoke(ptx_version, dispatch));
       if (cudaSuccess != error)
       {
         break;
diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh
index 1065313c20d..c9a8a61395a 100644
--- a/cub/cub/device/dispatch/kernels/merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh
@@ -116,6 +116,28 @@ public:
   using block_sort_agent_t =
     ::cuda::std::_If<uses_fallback_policy, fallback_block_sort_agent_t, default_block_sort_agent_t>;
   using merge_agent_t = ::cuda::std::_If<uses_fallback_policy, fallback_merge_agent_t, default_merge_agent_t>;
+
+  _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t block_sort_vsmem_per_block()
+  {
+    return detail::vsmem_helper_impl<block_sort_agent_t>::vsmem_per_block;
+  }
+
+  _CCCL_HOST_DEVICE static constexpr ::cuda::std::size_t merge_vsmem_per_block()
+  {
+    return detail::vsmem_helper_impl<merge_agent_t>::vsmem_per_block;
+  }
+
+  template <typename PolicyT>
+  _CCCL_HOST_DEVICE static constexpr int BlockThreads(PolicyT /*policy*/)
+  {
+    return policy_t::BLOCK_THREADS;
+  }
+
+  template <typename PolicyT>
+  _CCCL_HOST_DEVICE static constexpr int ItemsPerTile(PolicyT /*policy*/)
+  {
+    return policy_t::ITEMS_PER_TILE;
+  }
 };
 template <typename ChainedPolicyT,
           typename KeyInputIteratorT,
diff --git a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh
index 29e98a3898a..2c93b1b1147 100644
--- a/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_merge_sort.cuh
@@ -42,10 +42,34 @@
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail
+namespace detail::merge_sort
 {
-namespace merge_sort
+
+template <typename PolicyT, typename = void>
+struct MergeSortPolicyWrapper : PolicyT
+{
+  CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(PolicyT base)
+      : PolicyT(base)
+  {}
+};
+
+template <typename StaticPolicyT>
+struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<typename StaticPolicyT::MergeSortPolicy>>
+    : StaticPolicyT
+{
+  CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper(StaticPolicyT base)
+      : StaticPolicyT(base)
+  {}
+
+  CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
+};
+
+template <typename PolicyT>
+CUB_RUNTIME_FUNCTION MergeSortPolicyWrapper<PolicyT> MakeMergeSortPolicyWrapper(PolicyT policy)
 {
+  return MergeSortPolicyWrapper<PolicyT>{policy};
+}
+
 template <typename KeyIteratorT>
 struct policy_hub
 {
@@ -88,8 +112,8 @@ struct policy_hub
 
   using MaxPolicy = Policy600;
 };
-} // namespace merge_sort
-} // namespace detail
+
+} // namespace detail::merge_sort
 
 template <typename KeyIteratorT>
 using DeviceMergeSortPolicy CCCL_DEPRECATED_BECAUSE("This class is considered an implementation detail and it will be "
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index fd356b8f9e5..ca365b531fc 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -553,6 +553,11 @@ struct PolicyWrapper<
   {
     return StaticPolicyT::ITEMS_PER_THREAD;
   }
+
+  CUB_RUNTIME_FUNCTION static constexpr int ItemsPerTile()
+  {
+    return StaticPolicyT::ITEMS_PER_TILE;
+  }
 };
 
 template <typename PolicyT>

From 5ce5d28f0572d34126e00f0765977d8c54391e8e Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 10:52:13 +0100
Subject: [PATCH 08/15] PTX: Update existing instructions (#3584)

* mbarrier.expect_tx: Add missing source and test
  (the instruction was already documented)

* cp.async.bulk.tensor: Add .{gather,scatter}4
* fence: Add .sync_restrict, .proxy.async.sync_restrict

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 .../ptx/instructions/cp_async_bulk_tensor.rst |  5 +++
 docs/libcudacxx/ptx/instructions/fence.rst    | 10 +++++
 .../__ptx/instructions/cp_async_bulk_tensor.h |  1 +
 .../include/cuda/__ptx/instructions/fence.h   |  2 +
 .../__ptx/instructions/mbarrier_expect_tx.h   | 37 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   |  1 +
 .../ptx.cp.async.bulk.tensor.compile.pass.cpp |  1 +
 .../cuda/ptx/ptx.fence.compile.pass.cpp       |  2 +
 .../ptx.mbarrier.expect_tx.compile.pass.cpp   | 22 +++++++++++
 9 files changed, 81 insertions(+)
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
index bde3488bac9..8dc9def989b 100644
--- a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
+++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
@@ -21,3 +21,8 @@ Multicast
 ---------
 
 .. include:: generated/cp_async_bulk_tensor_multicast.rst
+
+Scatter / Gather
+----------------
+
+.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst
diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst
index 82de170f63b..4d9126be62f 100644
--- a/docs/libcudacxx/ptx/instructions/fence.rst
+++ b/docs/libcudacxx/ptx/instructions/fence.rst
@@ -13,6 +13,11 @@ fence
 
 .. include:: generated/fence.rst
 
+fence.sync_restrict
+-------------------
+
+.. include:: generated/fence_sync_restrict.rst
+
 fence.mbarrier_init
 -------------------
 
@@ -29,6 +34,11 @@ fence.proxy.async
 
 .. include:: generated/fence_proxy_async.rst
 
+fence.proxy.async.sync_restrict
+-------------------------------
+
+.. include:: generated/fence_proxy_async_generic_sync_restrict.rst
+
 fence.proxy.tensormap
 ---------------------
 
diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h
index 7de5b41b744..f99c0c6f73b 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h
@@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
 #include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h>
+#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h>
 #include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h
index a8dccf979c2..3c123840797 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/fence.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h
@@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 #include <cuda/__ptx/instructions/generated/fence_mbarrier_init.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_alias.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_async.h>
+#include <cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h>
+#include <cuda/__ptx/instructions/generated/fence_sync_restrict.h>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 
diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h
new file mode 100644
index 00000000000..886bfe64d75
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_
+#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/mbarrier_expect_tx.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 44edb20c98e..4798973df77 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -80,6 +80,7 @@
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/__ptx/instructions/getctarank.h>
 #include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#include <cuda/__ptx/instructions/mbarrier_expect_tx.h>
 #include <cuda/__ptx/instructions/mbarrier_init.h>
 #include <cuda/__ptx/instructions/mbarrier_wait.h>
 #include <cuda/__ptx/instructions/red_async.h>
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
index 42bc5b8e355..efd66a8fa4e 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
@@ -17,6 +17,7 @@
 #include "nvrtc_workaround.h"
 // above header needs to be included before the generated test header
 #include "generated/cp_async_bulk_tensor.h"
+#include "generated/cp_async_bulk_tensor_gather_scatter.h"
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp
index c439720b8f8..aa2c9ec6152 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp
@@ -20,7 +20,9 @@
 #include "generated/fence_mbarrier_init.h"
 #include "generated/fence_proxy_alias.h"
 #include "generated/fence_proxy_async.h"
+#include "generated/fence_proxy_async_generic_sync_restrict.h"
 #include "generated/fence_proxy_tensormap_generic.h"
+#include "generated/fence_sync_restrict.h"
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp
new file mode 100644
index 00000000000..f4d06bdb8ba
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.expect_tx.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/mbarrier_expect_tx.h"
+
+int main(int, char**)
+{
+  return 0;
+}

From a1a73a8708eac531498762c22999d0a5aea076d0 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 11:26:03 +0100
Subject: [PATCH 09/15] Internalize cuda/detail/core/util.h (#3505)

---
 cub/cub/agent/agent_adjacent_difference.cuh   |  4 +-
 cub/cub/agent/agent_merge.cuh                 |  8 +-
 cub/cub/agent/agent_merge_sort.cuh            | 15 ++--
 cub/cub/agent/agent_sub_warp_merge_sort.cuh   |  4 +-
 cub/cub/device/dispatch/dispatch_merge.cuh    |  2 +-
 .../device/dispatch/kernels/merge_sort.cuh    | 19 ++---
 .../system/cuda/detail/core/agent_launcher.h  | 46 +++---------
 .../system/cuda/detail/core/load_iterator.h   |  4 +-
 .../cuda/detail/core/make_load_iterator.h     |  4 +-
 thrust/thrust/system/cuda/detail/core/util.h  | 51 +------------
 thrust/thrust/system/cuda/detail/extrema.h    | 18 ++---
 thrust/thrust/system/cuda/detail/reduce.h     | 40 +++++-----
 .../thrust/system/cuda/detail/reduce_by_key.h | 48 ++++++------
 .../system/cuda/detail/set_operations.h       | 73 +++++++++----------
 thrust/thrust/system/cuda/detail/sort.h       |  6 +-
 thrust/thrust/system/cuda/detail/unique.h     | 43 +++++------
 16 files changed, 156 insertions(+), 229 deletions(-)

diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh
index c19cb90079a..8617c78193b 100644
--- a/cub/cub/agent/agent_adjacent_difference.cuh
+++ b/cub/cub/agent/agent_adjacent_difference.cuh
@@ -79,7 +79,7 @@ template <typename Policy,
           bool ReadLeft>
 struct AgentDifference
 {
-  using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
+  using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, InputIteratorT>::type;
 
   using BlockLoad  = typename cub::BlockLoadType<Policy, LoadIt>::type;
   using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
@@ -119,7 +119,7 @@ struct AgentDifference
     OffsetT num_items)
       : temp_storage(temp_storage.Alias())
       , input_it(input_it)
-      , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
+      , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it))
       , first_tile_previous(first_tile_previous)
       , result(result)
       , difference_op(difference_op)
diff --git a/cub/cub/agent/agent_merge.cuh b/cub/cub/agent/agent_merge.cuh
index 9ae14c3e42e..5c7d5322456 100644
--- a/cub/cub/agent/agent_merge.cuh
+++ b/cub/cub/agent/agent_merge.cuh
@@ -64,10 +64,10 @@ struct agent_t
   using key_type  = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
   using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;
 
-  using keys_load_it1  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
-  using keys_load_it2  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
-  using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
-  using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
+  using keys_load_it1  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
+  using keys_load_it2  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
+  using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt1>::type;
+  using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt2>::type;
 
   using block_load_keys1  = typename BlockLoadType<Policy, keys_load_it1>::type;
   using block_load_keys2  = typename BlockLoadType<Policy, keys_load_it2>::type;
diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh
index bf4984f7256..1ec952187a7 100644
--- a/cub/cub/agent/agent_merge_sort.cuh
+++ b/cub/cub/agent/agent_merge_sort.cuh
@@ -91,8 +91,10 @@ struct AgentBlockSort
 
   using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;
 
-  using KeysLoadIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyInputIteratorT>::type;
-  using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueInputIteratorT>::type;
+  using KeysLoadIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyInputIteratorT>::type;
+  using ItemsLoadIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueInputIteratorT>::type;
 
   using BlockLoadKeys  = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
   using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
@@ -438,10 +440,11 @@ struct AgentMerge
   //---------------------------------------------------------------------
   // Types and constants
   //---------------------------------------------------------------------
-  using KeysLoadPingIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
-  using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
-  using KeysLoadPongIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyT*>::type;
-  using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueT*>::type;
+  using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyIteratorT>::type;
+  using ItemsLoadPingIt =
+    typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueIteratorT>::type;
+  using KeysLoadPongIt  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyT*>::type;
+  using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueT*>::type;
 
   using KeysOutputPongIt  = KeyIteratorT;
   using ItemsOutputPongIt = ValueIteratorT;
diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
index b10f1cda3ea..9f98ac42e3b 100644
--- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh
+++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -183,8 +183,8 @@ public:
 
   using WarpMergeSortT = WarpMergeSort<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::WARP_THREADS, ValueT>;
 
-  using KeysLoadItT  = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const KeyT*>::type;
-  using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const ValueT*>::type;
+  using KeysLoadItT  = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const KeyT*>::type;
+  using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const ValueT*>::type;
 
   using WarpLoadKeysT = cub::WarpLoad<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM, PolicyT::WARP_THREADS>;
   using WarpLoadItemsT =
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index b3d0c8ab2ca..c4df61fd29a 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -138,7 +138,7 @@ __launch_bounds__(
     CompareOp>::type;
   using MergePolicy = typename MergeAgent::policy;
 
-  using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator;
+  using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator;
   using vsmem_helper_t = vsmem_helper_impl<MergeAgent>;
   __shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage;
   auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
diff --git a/cub/cub/device/dispatch/kernels/merge_sort.cuh b/cub/cub/device/dispatch/kernels/merge_sort.cuh
index c9a8a61395a..79f7c6bbe40 100644
--- a/cub/cub/device/dispatch/kernels/merge_sort.cuh
+++ b/cub/cub/device/dispatch/kernels/merge_sort.cuh
@@ -19,12 +19,13 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 // We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code
 template <class PtxPlan, class It>
-typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it);
-} // namespace cuda_cub::core
+typename detail::LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
+make_load_iterator(PtxPlan const&, It it);
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
 
@@ -196,8 +197,8 @@ __launch_bounds__(
   AgentBlockSortT agent(
     ping,
     temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in),
     keys_count,
     keys_out,
     items_out,
@@ -302,10 +303,10 @@ __launch_bounds__(
   AgentMergeT agent(
     ping,
     temp_storage,
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
-    THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong),
+    THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong),
     keys_count,
     keys_pong,
     items_pong,
diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
index fb7c1ef22d6..d9baeb47593 100644
--- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h
+++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
@@ -62,7 +62,8 @@ namespace cuda_cub
 {
 namespace core
 {
-
+namespace detail
+{
 #  ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES
 #    define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES
 #  endif
@@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args)
 template <class Agent>
 struct AgentLauncher : Agent
 {
-  core::AgentPlan plan;
+  AgentPlan plan;
   size_t count;
   cudaStream_t stream;
   char const* name;
@@ -121,7 +122,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
       , vshmem(nullptr)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(count > 0);
@@ -136,7 +137,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
       , vshmem(vshmem)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(count > 0);
@@ -149,7 +150,7 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(plan.grid_size)
       , vshmem(nullptr)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(plan.grid_size > 0);
@@ -162,43 +163,19 @@ struct AgentLauncher : Agent
       , name(name_)
       , grid(plan.grid_size)
       , vshmem(vshmem)
-      , has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
+      , has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
       , shmem_size(has_shmem ? plan.shared_memory_size : 0)
   {
     assert(plan.grid_size > 0);
   }
 
-#  if 0
-    THRUST_RUNTIME_FUNCTION
-    AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
-    {
-      // in separable compilation mode, we have no choice
-      // but to call kernel to get agent_plan
-      // otherwise the risk is something may fail
-      // if user mix & match ptx versions in a separably compiled function
-      // http://nvbugs/1772071
-      // XXX may be it is too string of a requirements, consider relaxing it in
-      // the future
-#    ifdef __CUDACC_RDC__
-      return core::get_agent_plan<Agent>(s, d_ptr);
-#    else
-      return get_agent_plan<Agent>(core::get_ptx_version());
-#    endif
-    }
-    THRUST_RUNTIME_FUNCTION
-    AgentPlan static get_plan_default()
-    {
-      return get_agent_plan<Agent>(sm_arch<0>::type::ver);
-    }
-#  endif
-
-  THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
+  THRUST_RUNTIME_FUNCTION typename detail::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
   {
     THRUST_UNUSED_VAR(d_ptr);
-    return get_agent_plan<Agent>(core::get_ptx_version());
+    return get_agent_plan<Agent>(get_ptx_version());
   }
 
-  THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan()
+  THRUST_RUNTIME_FUNCTION typename detail::get_plan<Agent>::type static get_plan()
   {
     return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
   }
@@ -227,7 +204,7 @@ struct AgentLauncher : Agent
   {
 #  if THRUST_DEBUG_SYNC_FLAG
     cuda_optional<int> occ = max_sm_occupancy(k);
-    const int ptx_version  = core::get_ptx_version();
+    const int ptx_version  = get_ptx_version();
     if (count > 0)
     {
       _CubLog(
@@ -305,6 +282,7 @@ struct AgentLauncher : Agent
   }
 };
 
+} // namespace detail
 } // namespace core
 } // namespace cuda_cub
 
diff --git a/thrust/thrust/system/cuda/detail/core/load_iterator.h b/thrust/thrust/system/cuda/detail/core/load_iterator.h
index 07c5eba0eaa..6f2c118b151 100644
--- a/thrust/thrust/system/cuda/detail/core/load_iterator.h
+++ b/thrust/thrust/system/cuda/detail/core/load_iterator.h
@@ -34,7 +34,7 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 
 // LoadIterator
@@ -52,6 +52,6 @@ struct LoadIterator
                                cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
                                It>;
 }; // struct Iterator
-} // namespace cuda_cub::core
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
index 28c65c813ea..9497ccacca9 100644
--- a/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
+++ b/thrust/thrust/system/cuda/detail/core/make_load_iterator.h
@@ -33,7 +33,7 @@
 
 THRUST_NAMESPACE_BEGIN
 
-namespace cuda_cub::core
+namespace cuda_cub::core::detail
 {
 template <class PtxPlan, class It>
 typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
@@ -55,6 +55,6 @@ typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa
   return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
 }
 
-} // namespace cuda_cub::core
+} // namespace cuda_cub::core::detail
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h
index 94a7e750aeb..b3bdcf1f086 100644
--- a/thrust/thrust/system/cuda/detail/core/util.h
+++ b/thrust/thrust/system/cuda/detail/core/util.h
@@ -78,6 +78,8 @@ namespace core
 #  endif
 #endif
 
+namespace detail
+{
 /// Typelist - a container of types
 template <typename...>
 struct typelist;
@@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block()
   return static_cast<size_t>(i32value);
 }
 
-THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block)
-{
-  size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
-  if (shmem_per_block > max_shmem_per_block)
-  {
-    return shmem_per_block;
-  }
-  else
-  {
-    return 0;
-  }
-}
-
 THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
 {
-  size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+  size_t max_shmem_per_block = get_max_shared_memory_per_block();
   if (shmem_per_block > max_shmem_per_block)
   {
     return shmem_per_block * num_blocks;
@@ -509,22 +498,6 @@ struct BlockLoad
                    get_arch<PtxPlan>::type::ver>;
 };
 
-// BlockStore
-// -----------
-// a helper metaprogram that returns type of a block loader
-template <class PtxPlan, class It, class T = typename iterator_traits<It>::value_type>
-struct BlockStore
-{
-  using type =
-    cub::BlockStore<T,
-                    PtxPlan::BLOCK_THREADS,
-                    PtxPlan::ITEMS_PER_THREAD,
-                    PtxPlan::STORE_ALGORITHM,
-                    1,
-                    1,
-                    get_arch<PtxPlan>::type::ver>;
-};
-
 // cuda_optional
 // --------------
 // used for function that return cudaError_t along with the result
@@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version()
   return ptx_version;
 }
 
-THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream)
-{
-  return cub::SyncStream(stream);
-}
-
-inline void _CCCL_DEVICE sync_threadblock()
-{
-  __syncthreads();
-}
-
 // Deprecated [Since 2.8]
 #define CUDA_CUB_RET_IF_FAIL(e)                \
   {                                            \
@@ -719,11 +682,6 @@ struct uninitialized_array
   }
 };
 
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align)
-{
-  return ((n + align - 1) / align) * align;
-}
-
 namespace host
 {
 inline cuda_optional<size_t> get_max_shared_memory_per_block()
@@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage(
   return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes);
 }
 
+} // namespace detail
 } // namespace core
-using core::sm52;
-using core::sm60;
 } // namespace cuda_cub
 
 THRUST_NAMESPACE_END
diff --git a/thrust/thrust/system/cuda/detail/extrema.h b/thrust/thrust/system/cuda/detail/extrema.h
index 617eb8bbc79..b2124323424 100644
--- a/thrust/thrust/system/cuda/detail/extrema.h
+++ b/thrust/thrust/system/cuda/detail/extrema.h
@@ -184,10 +184,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   OutputIt output_it,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::cuda_optional;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::cuda_optional;
+  using core::detail::get_agent_plan;
 
   using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
 
@@ -204,7 +204,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   if (num_items <= reduce_plan.items_per_tile)
   {
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);
 
     // small, single tile size
     if (d_temp_storage == nullptr)
@@ -221,7 +221,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   else
   {
     // regular size
-    cuda_optional<int> sm_count = core::get_sm_count();
+    cuda_optional<int> sm_count = core::detail::get_sm_count();
     CUDA_CUB_RET_IF_FAIL(sm_count.status());
 
     // reduction will not use more cta counts than requested
@@ -245,7 +245,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
     // we will launch at most "max_blocks" blocks in a grid
     // so preallocate virtual shared memory storage for this if required
     //
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
 
     // Temporary storage allocation requirements
     void* allocations[3]       = {nullptr, nullptr, nullptr};
@@ -331,14 +331,14 @@ extrema(execution_policy<Derived>& policy, InputIt first, Size num_items, Binary
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");
 
   T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/reduce.h b/thrust/thrust/system/cuda/detail/reduce.h
index 3787ab62367..61ec2086adf 100644
--- a/thrust/thrust/system/cuda/detail/reduce.h
+++ b/thrust/thrust/system/cuda/detail/reduce.h
@@ -109,7 +109,7 @@ template <class, class>
 struct Tuning;
 
 template <class T>
-struct Tuning<sm52, T>
+struct Tuning<core::detail::sm52, T>
 {
   enum
   {
@@ -155,7 +155,7 @@ struct ReduceAgent
     using tuning = Tuning<Arch, T>;
 
     using Vector      = typename cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH>;
-    using LoadIt      = typename core::LoadIterator<PtxPlan, InputIt>::type;
+    using LoadIt      = typename core::detail::LoadIterator<PtxPlan, InputIt>::type;
     using BlockReduce = cub::BlockReduce<T, PtxPlan::BLOCK_THREADS, PtxPlan::BLOCK_ALGORITHM, 1, 1, Arch::ver>;
 
     using VectorLoadIt = cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, Vector, Size>;
@@ -175,7 +175,7 @@ struct ReduceAgent
   // Other algorithms, e.g. merge, may not need additional information,
   // and may use AgentPlan directly, instead of defining their own Plan type.
   //
-  struct Plan : core::AgentPlan
+  struct Plan : core::detail::AgentPlan
   {
     cub::GridMappingStrategy grid_mapping;
 
@@ -183,7 +183,7 @@ struct ReduceAgent
 
     template <class P>
     THRUST_RUNTIME_FUNCTION Plan(P)
-        : core::AgentPlan(P())
+        : core::detail::AgentPlan(P())
         , grid_mapping(P::GRID_MAPPING)
     {}
   };
@@ -192,7 +192,7 @@ struct ReduceAgent
   // ptx_plan type *must* only be used from device code
   // Its use from host code will result in *undefined behaviour*
   //
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using TempStorage  = typename ptx_plan::TempStorage;
   using Vector       = typename ptx_plan::Vector;
@@ -230,7 +230,7 @@ struct ReduceAgent
     THRUST_DEVICE_FUNCTION impl(TempStorage& storage_, InputIt input_it_, ReductionOp reduction_op_)
         : storage(storage_)
         , input_it(input_it_)
-        , load_it(core::make_load_iterator(ptx_plan(), input_it))
+        , load_it(core::detail::make_load_iterator(ptx_plan(), input_it))
         , reduction_op(reduction_op_)
     {}
 
@@ -428,8 +428,6 @@ struct ReduceAgent
     THRUST_DEVICE_FUNCTION T
     consume_tiles_impl(Size num_items, cub::GridQueue<UnsignedSize> queue, CAN_VECTORIZE can_vectorize)
     {
-      using core::sync_threadblock;
-
       // We give each thread block at least one tile of input.
       T thread_aggregate;
       Size block_offset    = blockIdx.x * ITEMS_PER_TILE;
@@ -454,7 +452,7 @@ struct ReduceAgent
           storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
         }
 
-        sync_threadblock();
+        __syncthreads();
 
         // Grab tile offset and check if we're done with full tiles
         block_offset = storage.dequeue_offset;
@@ -465,7 +463,7 @@ struct ReduceAgent
           consume_tile<false>(
             thread_aggregate, block_offset, ITEMS_PER_TILE, thrust::detail::true_type(), can_vectorize);
 
-          sync_threadblock();
+          __syncthreads();
 
           // Dequeue a tile of items
           if (threadIdx.x == 0)
@@ -473,7 +471,7 @@ struct ReduceAgent
             storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base;
           }
 
-          sync_threadblock();
+          __syncthreads();
 
           // Grab tile offset and check if we're done with full tiles
           block_offset = storage.dequeue_offset;
@@ -586,7 +584,7 @@ struct DrainAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<1>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -609,10 +607,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   OutputIt output_it,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::cuda_optional;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::cuda_optional;
+  using core::detail::get_agent_plan;
 
   using UnsignedSize = typename detail::make_unsigned_special<Size>::type;
 
@@ -629,7 +627,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   if (num_items <= reduce_plan.items_per_tile)
   {
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, 1);
 
     // small, single tile size
     if (d_temp_storage == nullptr)
@@ -646,7 +644,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   else
   {
     // regular size
-    cuda_optional<int> sm_count = core::get_sm_count();
+    cuda_optional<int> sm_count = core::detail::get_sm_count();
     CUDA_CUB_RET_IF_FAIL(sm_count.status());
 
     // reduction will not use more cta counts than requested
@@ -670,7 +668,7 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
     // we will launch at most "max_blocks" blocks in a grid
     // so preallocate virtual shared memory storage for this if required
     //
-    size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
+    size_t vshmem_size = core::detail::vshmem_size(reduce_plan.shared_memory_size, max_blocks);
 
     // Temporary storage allocation requirements
     void* allocations[3]       = {nullptr, nullptr, nullptr};
@@ -755,14 +753,14 @@ reduce(execution_policy<Derived>& policy, InputIt first, Size num_items, T init,
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
 
   T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h
index ae1f0ffab96..8c1db436085 100644
--- a/thrust/thrust/system/cuda/detail/reduce_by_key.h
+++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h
@@ -115,7 +115,7 @@ template <class Arch, class Key, class Value>
 struct Tuning;
 
 template <class Key, class Value>
-struct Tuning<sm52, Key, Value>
+struct Tuning<core::detail::sm52, Key, Value>
 {
   enum
   {
@@ -163,11 +163,11 @@ struct ReduceByKeyAgent
   {
     using tuning = Tuning<Arch, key_type, value_type>;
 
-    using KeysLoadIt   = typename core::LoadIterator<PtxPlan, KeysInputIt>::type;
-    using ValuesLoadIt = typename core::LoadIterator<PtxPlan, ValuesInputIt>::type;
+    using KeysLoadIt   = typename core::detail::LoadIterator<PtxPlan, KeysInputIt>::type;
+    using ValuesLoadIt = typename core::detail::LoadIterator<PtxPlan, ValuesInputIt>::type;
 
-    using BlockLoadKeys   = typename core::BlockLoad<PtxPlan, KeysLoadIt>::type;
-    using BlockLoadValues = typename core::BlockLoad<PtxPlan, ValuesLoadIt>::type;
+    using BlockLoadKeys   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt>::type;
+    using BlockLoadValues = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt>::type;
 
     using BlockDiscontinuityKeys = cub::BlockDiscontinuity<key_type, PtxPlan::BLOCK_THREADS, 1, 1, Arch::ver>;
 
@@ -188,11 +188,11 @@ struct ReduceByKeyAgent
       typename BlockLoadKeys::TempStorage load_keys;
       typename BlockLoadValues::TempStorage load_values;
 
-      core::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1> raw_exchange;
+      core::detail::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1> raw_exchange;
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using KeysLoadIt             = typename ptx_plan::KeysLoadIt;
   using ValuesLoadIt           = typename ptx_plan::ValuesLoadIt;
@@ -360,9 +360,7 @@ struct ReduceByKeyAgent
       size_type num_tile_segments,
       size_type num_tile_segments_prefix)
     {
-      using core::sync_threadblock;
-
-      sync_threadblock();
+      __syncthreads();
 
       // Compact and scatter keys
 #  pragma unroll
@@ -375,7 +373,7 @@ struct ReduceByKeyAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
       {
@@ -445,8 +443,6 @@ struct ReduceByKeyAgent
     template <bool IS_LAST_TILE>
     THRUST_DEVICE_FUNCTION void consume_first_tile(Size num_remaining, Size tile_offset, ScanTileState& tile_state)
     {
-      using core::sync_threadblock;
-
       key_type keys[ITEMS_PER_THREAD]; // Tile keys
       key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor)
       value_type values[ITEMS_PER_THREAD]; // Tile values
@@ -468,7 +464,7 @@ struct ReduceByKeyAgent
         BlockLoadKeys(storage.load_keys).Load(keys_load_it + tile_offset, keys);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Load values (last tile repeats final element)
       if (IS_LAST_TILE)
@@ -481,7 +477,7 @@ struct ReduceByKeyAgent
         BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Set head segment_flags.
       // First tile sets the first flag for the first item
@@ -540,8 +536,6 @@ struct ReduceByKeyAgent
     THRUST_DEVICE_FUNCTION void
     consume_subsequent_tile(Size num_remaining, int tile_idx, Size tile_offset, ScanTileState& tile_state)
     {
-      using core::sync_threadblock;
-
       key_type keys[ITEMS_PER_THREAD]; // Tile keys
       key_type pred_keys[ITEMS_PER_THREAD]; // Tile keys shifted up (predecessor)
       value_type values[ITEMS_PER_THREAD]; // Tile values
@@ -563,7 +557,7 @@ struct ReduceByKeyAgent
 
       key_type tile_pred_key = (threadIdx.x == 0) ? key_type(keys_load_it[tile_offset - 1]) : key_type();
 
-      sync_threadblock();
+      __syncthreads();
 
       // Load values (last tile repeats final element)
       if (IS_LAST_TILE)
@@ -576,7 +570,7 @@ struct ReduceByKeyAgent
         BlockLoadValues(storage.load_values).Load(values_load_it + tile_offset, values);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // Set head segment_flags
       BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
@@ -635,8 +629,8 @@ struct ReduceByKeyAgent
       int /*num_tiles*/,
       ScanTileState& tile_state)
         : storage(storage_)
-        , keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_))
-        , values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_))
+        , keys_load_it(core::detail::make_load_iterator(ptx_plan(), keys_input_it_))
+        , values_load_it(core::detail::make_load_iterator(ptx_plan(), values_input_it_))
         , keys_output_it(keys_output_it_)
         , values_output_it(values_output_it_)
         , num_runs_output_it(num_runs_output_it_)
@@ -703,7 +697,7 @@ struct InitAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<128>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -740,8 +734,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step(
   Size num_items,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
 
   cudaError_t status = cudaSuccess;
   if (num_items == 0)
@@ -762,7 +756,7 @@ THRUST_RUNTIME_FUNCTION cudaError_t doit_step(
   int tile_size  = reduce_by_key_plan.items_per_tile;
   Size num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-  size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles);
+  size_t vshmem_size = core::detail::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles);
 
   size_t allocation_sizes[2] = {9, vshmem_size};
   status                     = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
@@ -848,14 +842,14 @@ THRUST_RUNTIME_FUNCTION pair<KeysOutputIt, ValuesOutputIt> reduce_by_key_dispatc
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
 
   Size* d_num_runs_out = thrust::detail::aligned_reinterpret_cast<Size*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/set_operations.h b/thrust/thrust/system/cuda/detail/set_operations.h
index 7a267080bf8..b336f8e55fa 100644
--- a/thrust/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/thrust/system/cuda/detail/set_operations.h
@@ -222,7 +222,7 @@ struct Tuning;
 namespace mpl = thrust::detail::mpl::math;
 
 template <class T, class U>
-struct Tuning<sm52, T, U>
+struct Tuning<core::detail::sm52, T, U>
 {
   enum
   {
@@ -243,7 +243,7 @@ struct Tuning<sm52, T, U>
 }; // tuning sm52
 
 template <class T, class U>
-struct Tuning<sm60, T, U>
+struct Tuning<core::detail::sm60, T, U>
 {
   enum
   {
@@ -290,15 +290,15 @@ struct SetOpAgent
   {
     using tuning = Tuning<Arch, key_type, value_type>;
 
-    using KeysLoadIt1   = typename core::LoadIterator<PtxPlan, KeysIt1>::type;
-    using KeysLoadIt2   = typename core::LoadIterator<PtxPlan, KeysIt2>::type;
-    using ValuesLoadIt1 = typename core::LoadIterator<PtxPlan, ValuesIt1>::type;
-    using ValuesLoadIt2 = typename core::LoadIterator<PtxPlan, ValuesIt2>::type;
+    using KeysLoadIt1   = typename core::detail::LoadIterator<PtxPlan, KeysIt1>::type;
+    using KeysLoadIt2   = typename core::detail::LoadIterator<PtxPlan, KeysIt2>::type;
+    using ValuesLoadIt1 = typename core::detail::LoadIterator<PtxPlan, ValuesIt1>::type;
+    using ValuesLoadIt2 = typename core::detail::LoadIterator<PtxPlan, ValuesIt2>::type;
 
-    using BlockLoadKeys1   = typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type;
-    using BlockLoadKeys2   = typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type;
-    using BlockLoadValues1 = typename core::BlockLoad<PtxPlan, ValuesLoadIt1>::type;
-    using BlockLoadValues2 = typename core::BlockLoad<PtxPlan, ValuesLoadIt2>::type;
+    using BlockLoadKeys1   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt1>::type;
+    using BlockLoadKeys2   = typename core::detail::BlockLoad<PtxPlan, KeysLoadIt2>::type;
+    using BlockLoadValues1 = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt1>::type;
+    using BlockLoadValues2 = typename core::detail::BlockLoad<PtxPlan, ValuesLoadIt2>::type;
 
     using TilePrefixCallback = cub::TilePrefixCallbackOp<Size, ::cuda::std::plus<>, ScanTileState, Arch::ver>;
 
@@ -316,7 +316,7 @@ struct SetOpAgent
 
       struct LoadStorage
       {
-        core::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
+        core::detail::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
         union
         {
           // FIXME These don't appear to be used anywhere?
@@ -328,15 +328,15 @@ struct SetOpAgent
           // Allocate extra shmem than truly necessary
           // This will permit to avoid range checks in
           // serial set operations, e.g. serial_set_difference
-          core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> keys_shared;
+          core::detail::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> keys_shared;
 
-          core::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> values_shared;
+          core::detail::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS> values_shared;
         }; // anon union
       } load_storage; // struct LoadStorage
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using KeysLoadIt1   = typename ptx_plan::KeysLoadIt1;
   using KeysLoadIt2   = typename ptx_plan::KeysLoadIt2;
@@ -441,8 +441,6 @@ struct SetOpAgent
       Size tile_output_prefix,
       int tile_output_count)
     {
-      using core::sync_threadblock;
-
       int local_scatter_idx = thread_output_prefix - tile_output_prefix;
 #  pragma unroll
       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
@@ -452,7 +450,7 @@ struct SetOpAgent
           shared[local_scatter_idx++] = input[ITEM];
         }
       }
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < tile_output_count; item += BLOCK_THREADS)
       {
@@ -483,8 +481,7 @@ struct SetOpAgent
     template <bool IS_LAST_TILE>
     void THRUST_DEVICE_FUNCTION consume_tile(Size tile_idx)
     {
-      using core::sync_threadblock;
-      using core::uninitialized_array;
+      using core::detail::uninitialized_array;
 
       pair<Size, Size> partition_beg = partitions[tile_idx + 0];
       pair<Size, Size> partition_end = partitions[tile_idx + 1];
@@ -506,7 +503,7 @@ struct SetOpAgent
 
       reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc);
 
-      sync_threadblock();
+      __syncthreads();
 
       int diag_loc = min<int>(ITEMS_PER_THREAD * threadIdx.x, num_keys1 + num_keys2);
 
@@ -529,7 +526,7 @@ struct SetOpAgent
       int dst                          = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1;
       storage.load_storage.offset[dst] = value;
 
-      core::sync_threadblock();
+      __syncthreads();
 
       pair<int, int> partition1_loc = thrust::make_pair(
         storage.load_storage.offset[threadIdx.x] >> 16, storage.load_storage.offset[threadIdx.x] & 0xFFFF);
@@ -554,7 +551,7 @@ struct SetOpAgent
         indices,
         compare_op,
         set_op);
-      sync_threadblock();
+      __syncthreads();
 #  if 0
         if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2)
           active_mask = 0;
@@ -588,7 +585,7 @@ struct SetOpAgent
         tile_output_prefix = prefix_cb.GetExclusivePrefix();
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       // scatter results
       //
@@ -605,11 +602,11 @@ struct SetOpAgent
         value_type values_loc[ITEMS_PER_THREAD];
         gmem_to_reg<!IS_LAST_TILE>(values_loc, values1_in + keys1_beg, values2_in + keys2_beg, num_keys1, num_keys2);
 
-        sync_threadblock();
+        __syncthreads();
 
         reg_to_shared(&storage.load_storage.values_shared[0], values_loc);
 
-        sync_threadblock();
+        __syncthreads();
 
         // gather items from shared mem
         //
@@ -622,7 +619,7 @@ struct SetOpAgent
           }
         }
 
-        sync_threadblock();
+        __syncthreads();
 
         scatter(values_out,
                 values_loc,
@@ -660,10 +657,10 @@ struct SetOpAgent
          std::size_t* output_count_)
         : storage(storage_)
         , tile_state(tile_state_)
-        , keys1_in(core::make_load_iterator(ptx_plan(), keys1_))
-        , keys2_in(core::make_load_iterator(ptx_plan(), keys2_))
-        , values1_in(core::make_load_iterator(ptx_plan(), values1_))
-        , values2_in(core::make_load_iterator(ptx_plan(), values2_))
+        , keys1_in(core::detail::make_load_iterator(ptx_plan(), keys1_))
+        , keys2_in(core::detail::make_load_iterator(ptx_plan(), keys2_))
+        , values1_in(core::detail::make_load_iterator(ptx_plan(), values1_))
+        , values2_in(core::detail::make_load_iterator(ptx_plan(), values2_))
         , keys1_count(keys1_count_)
         , keys2_count(keys2_count_)
         , keys_out(keys_out_)
@@ -733,7 +730,7 @@ struct PartitionAgent
   struct PtxPlan : PtxPolicy<256>
   {};
 
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -767,7 +764,7 @@ struct InitAgent
   struct PtxPlan : PtxPolicy<128>
   {};
 
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -1058,8 +1055,8 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   cudaError_t status = cudaSuccess;
 
-  using core::AgentLauncher;
-  using core::AgentPlan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
 
   using set_op_agent = AgentLauncher<
     SetOpAgent<KeysIt1, KeysIt2, ValuesIt1, ValuesIt2, KeysOutputIt, ValuesOutputIt, Size, CompareOp, SetOp, HAS_VALUES>>;
@@ -1080,13 +1077,13 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), tile_agent_storage);
   CUDA_CUB_RET_IF_FAIL(status);
 
-  size_t vshmem_storage          = core::vshmem_size(set_op_plan.shared_memory_size, num_tiles);
+  size_t vshmem_storage          = core::detail::vshmem_size(set_op_plan.shared_memory_size, num_tiles);
   size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2;
 
   void* allocations[3]       = {nullptr, nullptr, nullptr};
   size_t allocation_sizes[3] = {tile_agent_storage, partition_agent_storage, vshmem_storage};
 
-  status = core::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(d_temp_storage, temp_storage_size, allocations, allocation_sizes);
   CUDA_CUB_RET_IF_FAIL(status);
 
   if (d_temp_storage == nullptr)
@@ -1192,14 +1189,14 @@ THRUST_RUNTIME_FUNCTION pair<KeysOutputIt, ValuesOutputIt> set_operations(
 
   size_t storage_size = 0;
 
-  status = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
 
   std::size_t* d_output_count = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h
index 2c3ef85202d..7ad67fd4e0c 100644
--- a/thrust/thrust/system/cuda/detail/sort.h
+++ b/thrust/thrust/system/cuda/detail/sort.h
@@ -58,6 +58,8 @@
 #  include <thrust/system/cuda/detail/util.h>
 #  include <thrust/type_traits/is_contiguous_iterator.h>
 
+#  include <cuda/cmath>
+
 #  include <cstdint>
 
 #  if defined(_CCCL_HAS_NVFP16)
@@ -277,8 +279,8 @@ THRUST_RUNTIME_FUNCTION void radix_sort(execution_policy<Derived>& policy, Key*
     dispatch<SORT_ITEMS, CompareOp>::doit(nullptr, temp_storage_bytes, keys_buffer, items_buffer, keys_count, stream);
   cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step");
 
-  size_t keys_temp_storage  = core::align_to(sizeof(Key) * keys_count, 128);
-  size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128);
+  size_t keys_temp_storage  = ::cuda::round_up(sizeof(Key) * keys_count, 128);
+  size_t items_temp_storage = ::cuda::round_up(sizeof(Item) * items_count, 128);
 
   size_t storage_size = keys_temp_storage + items_temp_storage + temp_storage_bytes;
 
diff --git a/thrust/thrust/system/cuda/detail/unique.h b/thrust/thrust/system/cuda/detail/unique.h
index ac94017758b..1d39b161866 100644
--- a/thrust/thrust/system/cuda/detail/unique.h
+++ b/thrust/thrust/system/cuda/detail/unique.h
@@ -123,7 +123,7 @@ struct items_per_thread
 };
 
 template <class T>
-struct Tuning<sm52, T>
+struct Tuning<core::detail::sm52, T>
 {
   const static int INPUT_SIZE = sizeof(T);
   enum
@@ -149,16 +149,16 @@ struct UniqueAgent
   {
     using tuning = Tuning<Arch, item_type>;
 
-    using ItemsLoadIt = typename core::LoadIterator<PtxPlan, ItemsIt>::type;
+    using ItemsLoadIt = typename core::detail::LoadIterator<PtxPlan, ItemsIt>::type;
 
-    using BlockLoadItems = typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type;
+    using BlockLoadItems = typename core::detail::BlockLoad<PtxPlan, ItemsLoadIt>::type;
 
     using BlockDiscontinuityItems = cub::BlockDiscontinuity<item_type, PtxPlan::BLOCK_THREADS, 1, 1, Arch::ver>;
 
     using TilePrefixCallback = cub::TilePrefixCallbackOp<Size, ::cuda::std::plus<>, ScanTileState, Arch::ver>;
     using BlockScan          = cub::BlockScan<Size, PtxPlan::BLOCK_THREADS, PtxPlan::SCAN_ALGORITHM, 1, 1, Arch::ver>;
 
-    using shared_items_t = core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>;
+    using shared_items_t = core::detail::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>;
 
     union TempStorage
     {
@@ -175,7 +175,7 @@ struct UniqueAgent
     }; // union TempStorage
   }; // struct PtxPlan
 
-  using ptx_plan = typename core::specialize_plan_msvc10_war<PtxPlan>::type::type;
+  using ptx_plan = typename core::detail::specialize_plan_msvc10_war<PtxPlan>::type::type;
 
   using ItemsLoadIt             = typename ptx_plan::ItemsLoadIt;
   using BlockLoadItems          = typename ptx_plan::BlockLoadItems;
@@ -224,8 +224,6 @@ struct UniqueAgent
       Size num_selections_prefix,
       Size /*num_selections*/)
     {
-      using core::sync_threadblock;
-
 #  pragma unroll
       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
       {
@@ -236,14 +234,14 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
       {
         items_out[num_selections_prefix + item] = get_shared()[item];
       }
 
-      sync_threadblock();
+      __syncthreads();
     }
 
     //---------------------------------------------------------------------
@@ -253,8 +251,7 @@ struct UniqueAgent
     template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
     Size THRUST_DEVICE_FUNCTION consume_tile_impl(int num_tile_items, int tile_idx, Size tile_base)
     {
-      using core::sync_threadblock;
-      using core::uninitialized_array;
+      using core::detail::uninitialized_array;
 
       item_type items_loc[ITEMS_PER_THREAD];
       Size selection_flags[ITEMS_PER_THREAD];
@@ -270,7 +267,7 @@ struct UniqueAgent
         BlockLoadItems(temp_storage.load_items).Load(items_in + tile_base, items_loc);
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       if (IS_FIRST_TILE)
       {
@@ -294,7 +291,7 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       Size num_tile_selections   = 0;
       Size num_selections        = 0;
@@ -337,7 +334,7 @@ struct UniqueAgent
         }
       }
 
-      sync_threadblock();
+      __syncthreads();
 
       scatter(items_loc,
               selection_flags,
@@ -420,7 +417,7 @@ struct UniqueAgent
 
     impl(storage,
          tile_state,
-         core::make_load_iterator(ptx_plan(), items_in),
+         core::detail::make_load_iterator(ptx_plan(), items_in),
          items_out,
          binary_pred,
          num_items,
@@ -435,7 +432,7 @@ struct InitAgent
   template <class Arch>
   struct PtxPlan : PtxPolicy<128>
   {};
-  using ptx_plan = core::specialize_plan<PtxPlan>;
+  using ptx_plan = core::detail::specialize_plan<PtxPlan>;
 
   //---------------------------------------------------------------------
   // Agent entry point
@@ -463,9 +460,9 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
   Size num_items,
   cudaStream_t stream)
 {
-  using core::AgentLauncher;
-  using core::AgentPlan;
-  using core::get_agent_plan;
+  using core::detail::AgentLauncher;
+  using core::detail::AgentPlan;
+  using core::detail::get_agent_plan;
 
   using unique_agent = AgentLauncher<UniqueAgent<ItemsInputIt, ItemsOutputIt, BinaryPred, Size, NumSelectedOutIt>>;
 
@@ -473,14 +470,14 @@ static cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
 
   using init_agent = AgentLauncher<InitAgent<ScanTileState, NumSelectedOutIt, Size>>;
 
-  using core::get_plan;
+  using core::detail::get_plan;
   typename get_plan<init_agent>::type init_plan     = init_agent::get_plan();
   typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
 
   int tile_size    = unique_plan.items_per_tile;
   size_t num_tiles = ::cuda::ceil_div(num_items, tile_size);
 
-  size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles);
+  size_t vshmem_size = core::detail::vshmem_size(unique_plan.shared_memory_size, num_tiles);
 
   cudaError_t status         = cudaSuccess;
   size_t allocation_sizes[2] = {0, vshmem_size};
@@ -550,14 +547,14 @@ THRUST_RUNTIME_FUNCTION ItemsOutputIt unique(
   void* allocations[2]       = {nullptr, nullptr};
 
   size_t storage_size = 0;
-  status              = core::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
+  status              = core::detail::alias_storage(nullptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "unique: failed on 1st step");
 
   // Allocate temporary storage.
   thrust::detail::temporary_array<std::uint8_t, Derived> tmp(policy, storage_size);
   void* ptr = static_cast<void*>(tmp.data().get());
 
-  status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes);
+  status = core::detail::alias_storage(ptr, storage_size, allocations, allocation_sizes);
   cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
 
   size_type* d_num_selected_out = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);

From 9a27ba3ba2da14dd9b8bd22c04ea057d9a7f493b Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 13:07:45 +0100
Subject: [PATCH 10/15] PTX: Add clusterlaunchcontrol (#3589)

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 docs/libcudacxx/ptx/instructions.rst          |  1 +
 .../ptx/instructions/clusterlaunchcontrol.rst | 11 ++++++
 .../__ptx/instructions/clusterlaunchcontrol.h | 37 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   |  1 +
 .../ptx.clusterlaunchcontrol.compile.pass.cpp | 22 +++++++++++
 5 files changed, 72 insertions(+)
 create mode 100644 docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index f0776974eec..32db843c28d 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -7,6 +7,7 @@ PTX Instructions
    :maxdepth: 1
 
    instructions/barrier_cluster
+   instructions/clusterlaunchcontrol
    instructions/cp_async_bulk
    instructions/cp_async_bulk_commit_group
    instructions/cp_async_bulk_wait_group
diff --git a/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst
new file mode 100644
index 00000000000..75fe44f6f22
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst
@@ -0,0 +1,11 @@
+.. _libcudacxx-ptx-instructions-clusterlaunchcontrol:
+
+clusterlaunchcontrol
+====================
+
+-  PTX ISA:
+   `clusterlaunchcontrol.try_cancel <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-try-cancel>`__
+-  PTX ISA:
+   `clusterlaunchcontrol.query_cancel <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-query-cancel>`__
+
+.. include:: generated/clusterlaunchcontrol.rst
diff --git a/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h
new file mode 100644
index 00000000000..b15cfddf4a0
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/clusterlaunchcontrol.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_
+#define _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/clusterlaunchcontrol.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_CLUSTERLAUNCHCONTROL_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 4798973df77..7087dd97d2a 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -70,6 +70,7 @@
  */
 
 #include <cuda/__ptx/instructions/barrier_cluster.h>
+#include <cuda/__ptx/instructions/clusterlaunchcontrol.h>
 #include <cuda/__ptx/instructions/cp_async_bulk.h>
 #include <cuda/__ptx/instructions/cp_async_bulk_commit_group.h>
 #include <cuda/__ptx/instructions/cp_async_bulk_tensor.h>
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp
new file mode 100644
index 00000000000..212414c4535
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/clusterlaunchcontrol.h"
+
+int main(int, char**)
+{
+  return 0; // no runtime checks: a *.compile.pass.cpp test, main just reports success
+}

From b1f2e63dafcb8d1379819e80375b1cd33393f449 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 13:10:21 +0100
Subject: [PATCH 11/15] PTX: Add st.bulk (#3604)

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 docs/libcudacxx/ptx/instructions.rst          |  1 +
 docs/libcudacxx/ptx/instructions/st_bulk.rst  |  9 +++++
 .../include/cuda/__ptx/instructions/st_bulk.h | 37 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   |  1 +
 .../cuda/ptx/ptx.st.bulk.compile.pass.cpp     | 22 +++++++++++
 5 files changed, 70 insertions(+)
 create mode 100644 docs/libcudacxx/ptx/instructions/st_bulk.rst
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/st_bulk.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index 32db843c28d..ebf6e31f716 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -24,6 +24,7 @@ PTX Instructions
    instructions/mbarrier_try_wait
    instructions/red_async
    instructions/st_async
+   instructions/st_bulk
    instructions/tensormap_replace
    instructions/tensormap_cp_fenceproxy
    instructions/special_registers
diff --git a/docs/libcudacxx/ptx/instructions/st_bulk.rst b/docs/libcudacxx/ptx/instructions/st_bulk.rst
new file mode 100644
index 00000000000..64886598909
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/st_bulk.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-st-bulk:
+
+st.bulk
+=======
+
+-  PTX ISA:
+   `st.bulk <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-bulk>`__
+
+.. include:: generated/st_bulk.rst
diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h
new file mode 100644
index 00000000000..686e0ecf166
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/st_bulk.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_ST_BULK_H_
+#define _CUDA_PTX_ST_BULK_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/st_bulk.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_ST_BULK_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 7087dd97d2a..db9e70ab7e6 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -86,6 +86,7 @@
 #include <cuda/__ptx/instructions/mbarrier_wait.h>
 #include <cuda/__ptx/instructions/red_async.h>
 #include <cuda/__ptx/instructions/st_async.h>
+#include <cuda/__ptx/instructions/st_bulk.h>
 #include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
 #include <cuda/__ptx/instructions/tensormap_replace.h>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp
new file mode 100644
index 00000000000..951e1a9f513
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/st_bulk.h"
+
+int main(int, char**)
+{
+  return 0; // no runtime checks: a *.compile.pass.cpp test, main just reports success
+}

From afa2ca25d00fc9bd8037b3b2ca064f2c18708bfc Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 13:10:35 +0100
Subject: [PATCH 12/15] PTX: Add multimem instructions (#3603)

* Add multimem.ld_reduce
* Add multimem.red
* Add multimem.st

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 docs/libcudacxx/ptx/instructions.rst          |  3 ++
 .../ptx/instructions/multimem_ld_reduce.rst   |  9 +++++
 .../ptx/instructions/multimem_red.rst         |  9 +++++
 .../ptx/instructions/multimem_st.rst          |  9 +++++
 .../__ptx/instructions/multimem_ld_reduce.h   | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/multimem_red.h    | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/multimem_st.h     | 37 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   |  3 ++
 .../ptx.multimem.ld_reduce.compile.pass.cpp   | 22 +++++++++++
 .../ptx/ptx.multimem.red.compile.pass.cpp     | 22 +++++++++++
 .../cuda/ptx/ptx.multimem.st.compile.pass.cpp | 22 +++++++++++
 11 files changed, 210 insertions(+)
 create mode 100644 docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/multimem_red.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/multimem_st.rst
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_red.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/multimem_st.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index ebf6e31f716..797e26d9911 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -22,6 +22,9 @@ PTX Instructions
    instructions/mbarrier_expect_tx
    instructions/mbarrier_test_wait
    instructions/mbarrier_try_wait
+   instructions/multimem_ld_reduce
+   instructions/multimem_red
+   instructions/multimem_st
    instructions/red_async
    instructions/st_async
    instructions/st_bulk
diff --git a/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst
new file mode 100644
index 00000000000..e9f5212131b
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/multimem_ld_reduce.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-multimem-ld_reduce:
+
+multimem.ld_reduce
+==================
+
+-  PTX ISA:
+   `multimem.ld_reduce <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red>`__
+
+.. include:: generated/multimem_ld_reduce.rst
diff --git a/docs/libcudacxx/ptx/instructions/multimem_red.rst b/docs/libcudacxx/ptx/instructions/multimem_red.rst
new file mode 100644
index 00000000000..0a6511b78d1
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/multimem_red.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-multimem-red:
+
+multimem.red
+============
+
+-  PTX ISA:
+   `multimem.red <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red>`__
+
+.. include:: generated/multimem_red.rst
diff --git a/docs/libcudacxx/ptx/instructions/multimem_st.rst b/docs/libcudacxx/ptx/instructions/multimem_st.rst
new file mode 100644
index 00000000000..75197f440c6
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/multimem_st.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-multimem-st:
+
+multimem.st
+===========
+
+-  PTX ISA:
+   `multimem.st <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red>`__
+
+.. include:: generated/multimem_st.rst
diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h
new file mode 100644
index 00000000000..29081e6107e
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_ld_reduce.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_MULTIMEM_LD_REDUCE_H_
+#define _CUDA_PTX_MULTIMEM_LD_REDUCE_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/multimem_ld_reduce.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_MULTIMEM_LD_REDUCE_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h
new file mode 100644
index 00000000000..f0fc4e4d0e5
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_red.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_MULTIMEM_RED_H_
+#define _CUDA_PTX_MULTIMEM_RED_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/multimem_red.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_MULTIMEM_RED_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h
new file mode 100644
index 00000000000..608402f0131
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/multimem_st.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_MULTIMEM_ST_H_
+#define _CUDA_PTX_MULTIMEM_ST_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/multimem_st.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_MULTIMEM_ST_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index db9e70ab7e6..d11659ac6fb 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -84,6 +84,9 @@
 #include <cuda/__ptx/instructions/mbarrier_expect_tx.h>
 #include <cuda/__ptx/instructions/mbarrier_init.h>
 #include <cuda/__ptx/instructions/mbarrier_wait.h>
+#include <cuda/__ptx/instructions/multimem_ld_reduce.h>
+#include <cuda/__ptx/instructions/multimem_red.h>
+#include <cuda/__ptx/instructions/multimem_st.h>
 #include <cuda/__ptx/instructions/red_async.h>
 #include <cuda/__ptx/instructions/st_async.h>
 #include <cuda/__ptx/instructions/st_bulk.h>
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp
new file mode 100644
index 00000000000..cbe0ba81971
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.ld_reduce.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/multimem_ld_reduce.h"
+
+int main(int, char**)
+{
+  return 0; // nothing to run: a *.compile.pass.cpp test only has to compile, so main just reports success
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp
new file mode 100644
index 00000000000..b4aefa3b338
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.red.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/multimem_red.h"
+
+int main(int, char**)
+{
+  return 0; // nothing to run: a *.compile.pass.cpp test only has to compile, so main just reports success
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp
new file mode 100644
index 00000000000..4998c854382
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.multimem.st.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/multimem_st.h"
+
+int main(int, char**)
+{
+  return 0; // nothing to run: a *.compile.pass.cpp test only has to compile, so main just reports success
+}

From 0f52dd50c8a049372dfba62950f490813c2217ea Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 13:38:22 +0100
Subject: [PATCH 13/15] PTX: Add cp.async.mbarrier.arrive{.noinc} (#3602)

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 docs/libcudacxx/ptx/instructions.rst          |  1 +
 .../instructions/cp_async_mbarrier_arrive.rst | 10 +++++
 .../instructions/cp_async_mbarrier_arrive.h   | 38 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   |  1 +
 ....cp.async.mbarrier.arrive.compile.pass.cpp | 23 +++++++++++
 5 files changed, 73 insertions(+)
 create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index 797e26d9911..87ccc82b5b1 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -12,6 +12,7 @@ PTX Instructions
    instructions/cp_async_bulk_commit_group
    instructions/cp_async_bulk_wait_group
    instructions/cp_async_bulk_tensor
+   instructions/cp_async_mbarrier_arrive
    instructions/cp_reduce_async_bulk
    instructions/cp_reduce_async_bulk_tensor
    instructions/fence
diff --git a/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst
new file mode 100644
index 00000000000..f2ff2ff5ee7
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst
@@ -0,0 +1,10 @@
+.. _libcudacxx-ptx-instructions-cp-async-mbarrier-arrive:
+
+cp.async.mbarrier.arrive
+========================
+
+-  PTX ISA:
+   `cp.async.mbarrier.arrive <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive>`__
+
+.. include:: generated/cp_async_mbarrier_arrive.rst
+.. include:: generated/cp_async_mbarrier_arrive_noinc.rst
diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h
new file mode 100644
index 00000000000..c19a09e2922
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h
@@ -0,0 +1,38 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_
+#define _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h>
+#include <cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_CP_ASYNC_MBARRIER_ARRIVE_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index d11659ac6fb..0d699b2e2ca 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -75,6 +75,7 @@
 #include <cuda/__ptx/instructions/cp_async_bulk_commit_group.h>
 #include <cuda/__ptx/instructions/cp_async_bulk_tensor.h>
 #include <cuda/__ptx/instructions/cp_async_bulk_wait_group.h>
+#include <cuda/__ptx/instructions/cp_async_mbarrier_arrive.h>
 #include <cuda/__ptx/instructions/cp_reduce_async_bulk.h>
 #include <cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h>
 #include <cuda/__ptx/instructions/fence.h>
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp
new file mode 100644
index 00000000000..97623078198
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.mbarrier.arrive.compile.pass.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/cp_async_mbarrier_arrive.h"
+#include "generated/cp_async_mbarrier_arrive_noinc.h"
+
+int main(int, char**)
+{
+  return 0; // nothing to run: a *.compile.pass.cpp test only has to compile, so main just reports success
+}

From 38983ebc42de5683e212562c931aa0789c6eefe7 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 30 Jan 2025 16:40:27 +0100
Subject: [PATCH 14/15] PTX: Add tcgen05 instructions (#3607)

* ptx: Add tcgen05.alloc

* ptx: Add tcgen05.commit

* ptx: Add tcgen05.cp

* ptx: Add tcgen05.fence

* ptx: Add tcgen05.ld

* ptx: Add tcgen05.mma

* ptx: Add tcgen05.mma.ws

* ptx: Add tcgen05.shift

* ptx: Add tcgen05.st

* ptx: Add tcgen05.wait

* fix docs

---------

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
---
 docs/libcudacxx/ptx/instructions.rst          | 10 +++++
 .../ptx/instructions/tcgen05_alloc.rst        |  9 +++++
 .../ptx/instructions/tcgen05_commit.rst       |  9 +++++
 .../ptx/instructions/tcgen05_cp.rst           |  9 +++++
 .../ptx/instructions/tcgen05_fence.rst        |  9 +++++
 .../ptx/instructions/tcgen05_ld.rst           |  9 +++++
 .../ptx/instructions/tcgen05_mma.rst          |  9 +++++
 .../ptx/instructions/tcgen05_mma_ws.rst       |  9 +++++
 .../ptx/instructions/tcgen05_shift.rst        |  9 +++++
 .../ptx/instructions/tcgen05_st.rst           |  9 +++++
 .../ptx/instructions/tcgen05_wait.rst         |  9 +++++
 .../cuda/__ptx/instructions/tcgen05_alloc.h   | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_commit.h  | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_cp.h      | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_fence.h   | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_ld.h      | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_mma.h     | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_mma_ws.h  | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_shift.h   | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_st.h      | 37 +++++++++++++++++++
 .../cuda/__ptx/instructions/tcgen05_wait.h    | 37 +++++++++++++++++++
 libcudacxx/include/cuda/ptx                   | 10 +++++
 .../ptx/ptx.tcgen05.alloc.compile.pass.cpp    | 22 +++++++++++
 .../ptx/ptx.tcgen05.commit.compile.pass.cpp   | 22 +++++++++++
 .../cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp  | 22 +++++++++++
 .../ptx/ptx.tcgen05.fence.compile.pass.cpp    | 22 +++++++++++
 .../cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp  | 22 +++++++++++
 .../cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp | 22 +++++++++++
 .../ptx/ptx.tcgen05.mma.ws.compile.pass.cpp   | 22 +++++++++++
 .../ptx/ptx.tcgen05.shift.compile.pass.cpp    | 22 +++++++++++
 .../cuda/ptx/ptx.tcgen05.st.compile.pass.cpp  | 22 +++++++++++
 .../ptx/ptx.tcgen05.wait.compile.pass.cpp     | 22 +++++++++++
 32 files changed, 700 insertions(+)
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_commit.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_cp.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_fence.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_ld.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_shift.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_st.rst
 create mode 100644 docs/libcudacxx/ptx/instructions/tcgen05_wait.rst
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h
 create mode 100644 libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp

diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index 87ccc82b5b1..136dfb81fc3 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -29,6 +29,16 @@ PTX Instructions
    instructions/red_async
    instructions/st_async
    instructions/st_bulk
+   instructions/tcgen05_alloc
+   instructions/tcgen05_commit
+   instructions/tcgen05_cp
+   instructions/tcgen05_fence
+   instructions/tcgen05_ld
+   instructions/tcgen05_mma
+   instructions/tcgen05_mma_ws
+   instructions/tcgen05_shift
+   instructions/tcgen05_st
+   instructions/tcgen05_wait
    instructions/tensormap_replace
    instructions/tensormap_cp_fenceproxy
    instructions/special_registers
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst
new file mode 100644
index 00000000000..a30f2a2560c
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_alloc.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-alloc:
+
+tcgen05.alloc
+=============
+
+-  PTX ISA:
+   `tcgen05.alloc <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-alloc-tcgen05-dealloc-tcgen05-relinquish-alloc-permit>`__
+
+.. include:: generated/tcgen05_alloc.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst
new file mode 100644
index 00000000000..a431350dea8
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_commit.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-commit:
+
+tcgen05.commit
+==============
+
+-  PTX ISA:
+   `tcgen05.commit <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-commit>`__
+
+.. include:: generated/tcgen05_commit.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst
new file mode 100644
index 00000000000..5a220536d6e
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_cp.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-cp:
+
+tcgen05.cp
+==========
+
+-  PTX ISA:
+   `tcgen05.cp <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-cp>`__
+
+.. include:: generated/tcgen05_cp.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst
new file mode 100644
index 00000000000..6635131f707
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_fence.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-fence:
+
+tcgen05.fence
+=============
+
+-  PTX ISA:
+   `tcgen05.fence <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-fence>`__
+
+.. include:: generated/tcgen05_fence.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst
new file mode 100644
index 00000000000..165b8eb935a
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_ld.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-ld:
+
+tcgen05.ld
+==========
+
+-  PTX ISA:
+   `tcgen05.ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-ld>`__
+
+.. include:: generated/tcgen05_ld.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst
new file mode 100644
index 00000000000..9672ae0d0a1
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-mma:
+
+tcgen05.mma
+===========
+
+-  PTX ISA:
+   `tcgen05.mma <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-mma>`__
+
+.. include:: generated/tcgen05_mma.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst
new file mode 100644
index 00000000000..e22066298ac
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_mma_ws.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-mma-ws:
+
+tcgen05.mma.ws
+==============
+
+-  PTX ISA:
+   `tcgen05.mma.ws <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-mma-ws>`__
+
+.. include:: generated/tcgen05_mma_ws.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst
new file mode 100644
index 00000000000..eef04ae4d5e
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_shift.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-shift:
+
+tcgen05.shift
+=============
+
+-  PTX ISA:
+   `tcgen05.shift <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-shift>`__
+
+.. include:: generated/tcgen05_shift.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst
new file mode 100644
index 00000000000..f101149481f
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_st.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-st:
+
+tcgen05.st
+==========
+
+-  PTX ISA:
+   `tcgen05.st <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-st>`__
+
+.. include:: generated/tcgen05_st.rst
diff --git a/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst
new file mode 100644
index 00000000000..cb149e5c9a1
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tcgen05_wait.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tcgen05-wait:
+
+tcgen05.wait
+============
+
+-  PTX ISA:
+   `tcgen05.wait <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensorcore-5th-generation-instructions-tcgen05-wait>`__
+
+.. include:: generated/tcgen05_wait.rst
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h
new file mode 100644
index 00000000000..743ee4306ee
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_alloc.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_ALLOC_H_
+#define _CUDA_PTX_TCGEN05_ALLOC_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_alloc.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_ALLOC_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h
new file mode 100644
index 00000000000..ca06ec6b97d
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_commit.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_COMMIT_H_
+#define _CUDA_PTX_TCGEN05_COMMIT_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_commit.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_COMMIT_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h
new file mode 100644
index 00000000000..e0c6ebf74ad
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_cp.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_CP_H_
+#define _CUDA_PTX_TCGEN05_CP_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_cp.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_CP_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h
new file mode 100644
index 00000000000..a36847cd0f3
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_fence.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_FENCE_H_
+#define _CUDA_PTX_TCGEN05_FENCE_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_fence.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_FENCE_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h
new file mode 100644
index 00000000000..782ba20e804
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_ld.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_LD_H_
+#define _CUDA_PTX_TCGEN05_LD_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_ld.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_LD_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h
new file mode 100644
index 00000000000..ff9d159930b
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_MMA_H_
+#define _CUDA_PTX_TCGEN05_MMA_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_mma.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_MMA_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h
new file mode 100644
index 00000000000..5d0bd5b8b5a
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_mma_ws.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_MMA_WS_H_
+#define _CUDA_PTX_TCGEN05_MMA_WS_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_mma_ws.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_MMA_WS_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h
new file mode 100644
index 00000000000..aab5cbe27b8
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_shift.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_SHIFT_H_
+#define _CUDA_PTX_TCGEN05_SHIFT_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_shift.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_SHIFT_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h
new file mode 100644
index 00000000000..94c86614b1e
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_st.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_ST_H_
+#define _CUDA_PTX_TCGEN05_ST_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_st.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_ST_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h
new file mode 100644
index 00000000000..1684d9afd65
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/tcgen05_wait.h
@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_TCGEN05_WAIT_H_
+#define _CUDA_PTX_TCGEN05_WAIT_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/tcgen05_wait.h>
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_TCGEN05_WAIT_H_
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 0d699b2e2ca..971288b456c 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -91,6 +91,16 @@
 #include <cuda/__ptx/instructions/red_async.h>
 #include <cuda/__ptx/instructions/st_async.h>
 #include <cuda/__ptx/instructions/st_bulk.h>
+#include <cuda/__ptx/instructions/tcgen05_alloc.h>
+#include <cuda/__ptx/instructions/tcgen05_commit.h>
+#include <cuda/__ptx/instructions/tcgen05_cp.h>
+#include <cuda/__ptx/instructions/tcgen05_fence.h>
+#include <cuda/__ptx/instructions/tcgen05_ld.h>
+#include <cuda/__ptx/instructions/tcgen05_mma.h>
+#include <cuda/__ptx/instructions/tcgen05_mma_ws.h>
+#include <cuda/__ptx/instructions/tcgen05_shift.h>
+#include <cuda/__ptx/instructions/tcgen05_st.h>
+#include <cuda/__ptx/instructions/tcgen05_wait.h>
 #include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
 #include <cuda/__ptx/instructions/tensormap_replace.h>
 
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
new file mode 100644
index 00000000000..49f9df928e9
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_alloc.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
new file mode 100644
index 00000000000..73ea1851bec
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_commit.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
new file mode 100644
index 00000000000..85ddc17efe4
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_cp.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
new file mode 100644
index 00000000000..fda57b348de
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_fence.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
new file mode 100644
index 00000000000..8da8e54f18d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_ld.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
new file mode 100644
index 00000000000..098cbbfa896
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_mma.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
new file mode 100644
index 00000000000..350c964d749
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_mma_ws.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
new file mode 100644
index 00000000000..5ecfff7ff3b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_shift.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
new file mode 100644
index 00000000000..92a49224f0e
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_st.h"
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp
new file mode 100644
index 00000000000..4bb3156ed12
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/tcgen05_wait.h"
+
+int main(int, char**)
+{
+  return 0;
+}

From cea61a3410fdea796154dcd9157e010659aab837 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 30 Jan 2025 16:48:09 +0100
Subject: [PATCH 15/15] Use a different implementation for
 `tuple_of_iterator_references` to tuple conversion (#3609)

---
 .../include/cuda/std/detail/libcxx/include/tuple    | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
index 6ff1039e61b..47f8b16222b 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/tuple
@@ -891,10 +891,19 @@ public:
             enable_if_t<__is_tuple_of_iterator_references<_TupleOfIteratorReferences>::value, int> = 0,
             enable_if_t<(tuple_size<_TupleOfIteratorReferences>::value == sizeof...(_Tp)), int>    = 0>
   _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t)
-      : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t).template __to_tuple<_Tp...>(
-          __make_tuple_indices_t<sizeof...(_Tp)>()))
+      : tuple(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t),
+              typename __make_tuple_indices<sizeof...(_Tp)>::type{})
   {}
 
+private:
+  template <class _TupleOfIteratorReferences,
+            size_t... _Indices,
+            enable_if_t<__is_tuple_of_iterator_references<_TupleOfIteratorReferences>::value, int> = 0>
+  _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 tuple(_TupleOfIteratorReferences&& __t, __tuple_indices<_Indices...>)
+      : tuple(_CUDA_VSTD::get<_Indices>(_CUDA_VSTD::forward<_TupleOfIteratorReferences>(__t))...)
+  {}
+
+public:
   template <class _Tuple,
             class _Constraints                                          = __tuple_like_constraints<_Tuple>,
             enable_if_t<!_PackExpandsToThisTuple<_Tuple>::value, int>   = 0,