diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
index 971f0213cb0..bf0070dfc08 100644
--- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst
@@ -112,7 +112,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. code:: cuda

-   // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a
+   // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86, SM_100a, SM_101a
    // .dst = { .global }
    // .src = { .shared::cta }
    template <typename = void>
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
index fea199e4747..7b6ee3325f6 100644
--- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
@@ -118,14 +118,14 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -134,14 +134,14 @@ mbarrier.arrive.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -150,14 +150,14 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr);

 mbarrier.arrive.relaxed.cluster.shared::cta.b64
@@ -165,14 +165,14 @@ mbarrier.arrive.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr);

 mbarrier.arrive.relaxed.cluster.shared::cluster.b64
@@ -180,14 +180,14 @@ mbarrier.arrive.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr,
      const uint32_t& count);

@@ -196,12 +196,12 @@ mbarrier.arrive.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr);
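
Note (not part of the patch): after this reordering, call sites pass the semantics and scope tags before the state-space tag, matching the .sem.scope.space modifier order of the PTX instruction itself. A minimal call-site sketch for the first overload above, assuming an mbarrier initialized elsewhere and a build targeting sm_90 with PTX ISA 8.6:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void arrive_relaxed_example()
    {
      __shared__ alignas(8) uint64_t bar; // assumed initialized via mbarrier.init elsewhere
      // New tag order: .sem, .scope, .space.
      uint64_t state = cuda::ptx::mbarrier_arrive(
        cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 1);
      (void) state;
    }
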
diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
index 318a7eb5b98..fb171b05b55 100644
--- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
+++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst
@@ -54,14 +54,14 @@ mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& txCount);

@@ -70,14 +70,14 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cta }
    // .sem = { .relaxed }
    // .scope = { .cta, .cluster }
+   // .space = { .shared::cta }
    template <cuda::ptx::dot_scope Scope>
    __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-     cuda::ptx::space_shared_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_t<Scope> scope,
+     cuda::ptx::space_shared_t,
      uint64_t* addr,
      const uint32_t& txCount);

@@ -86,13 +86,13 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64
 .. code:: cuda

    // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-   // .space = { .shared::cluster }
    // .sem = { .relaxed }
    // .scope = { .cluster }
+   // .space = { .shared::cluster }
    template <typename = void>
    __device__ static inline void mbarrier_arrive_expect_tx(
-     cuda::ptx::space_cluster_t,
      cuda::ptx::sem_relaxed_t,
      cuda::ptx::scope_cluster_t,
+     cuda::ptx::space_cluster_t,
      uint64_t* addr,
      const uint32_t& txCount);
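
Note (not part of the patch): the transaction-count variant is reordered the same way. A sketch under the same assumptions (sm_90, PTX ISA 8.6), where txCount is the expected transaction count in bytes for subsequent bulk async copies:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void arrive_expect_tx_example()
    {
      __shared__ alignas(8) uint64_t bar; // assumed initialized elsewhere
      uint64_t state = cuda::ptx::mbarrier_arrive_expect_tx(
        cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 4096);
      (void) state;
    }
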
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
index f376f1b48c3..2f553b98f4c 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h
@@ -243,7 +243,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
 #endif // __cccl_ptx_isa >= 860

 /*
-// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80,
+// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86,
 SM_100a, SM_101a
 // .dst = { .global }
 // .src = { .shared::cta }
@@ -255,7 +255,7 @@ __device__ static inline void cp_async_bulk_tensor_tile_scatter4(
   const int32_t (&tensorCoords)[5],
   const void* srcMem);
 */
-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
 template <typename = void>
 _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
@@ -283,6 +283,6 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
     __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
 #  endif
 }
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860

 #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_
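
Note (not part of the patch): the scatter4 wrapper itself is unchanged; only its guard moves from PTX ISA 8.0 to 8.6, matching the comment fix in the docs above. A hypothetical call-site sketch, assuming a kernel built for sm_100a with PTX ISA >= 8.6 and a tensor map created on the host:

    #include <cuda/ptx>
    #include <cstdint>

    __device__ void scatter4_example(const void* tensor_map, const int32_t (&coords)[5], const void* smem_src)
    {
      // Tag order is unaffected by this patch: destination space, then source space.
      cuda::ptx::cp_async_bulk_tensor_tile_scatter4(
        cuda::ptx::space_global, cuda::ptx::space_shared, tensor_map, coords, smem_src);
    }
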
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
index 5f7b23dbb68..5f1a233908f 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h
@@ -223,14 +223,14 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr,
   const uint32_t& count);
 */
@@ -238,15 +238,15 @@ __device__ static inline uint64_t mbarrier_arrive(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
-  space_shared_t,
   sem_relaxed_t,
   scope_t<_Scope> __scope,
+  space_shared_t,
   _CUDA_VSTD::uint64_t* __addr,
   const _CUDA_VSTD::uint32_t& __count)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -274,25 +274,25 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t
-mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -320,14 +320,14 @@ mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VS

 /*
 // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr,
   const uint32_t& count);
 */
@@ -335,11 +335,11 @@ __device__ static inline void mbarrier_arrive(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void mbarrier_arrive(
-  space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
+  sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
       :
@@ -354,25 +354,25 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

 /*
 // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr);
 */
 #if __cccl_ptx_isa >= 860
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void
-mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory");
 #  else
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
index 5cbcd4cb3aa..05bd963c185 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h
@@ -90,14 +90,14 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(

 /*
 // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
 // .sem = { .relaxed }
 // .scope = { .cta, .cluster }
+// .space = { .shared::cta }
 template <cuda::ptx::dot_scope Scope>
 __device__ static inline uint64_t mbarrier_arrive_expect_tx(
-  cuda::ptx::space_shared_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_t<Scope> scope,
+  cuda::ptx::space_shared_t,
   uint64_t* addr,
   const uint32_t& txCount);
 */
@@ -105,15 +105,15 @@ __device__ static inline uint64_t mbarrier_arrive_expect_tx(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
 template <dot_scope _Scope>
 _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
-  space_shared_t,
   sem_relaxed_t,
   scope_t<_Scope> __scope,
+  space_shared_t,
   _CUDA_VSTD::uint64_t* __addr,
   const _CUDA_VSTD::uint32_t& __txCount)
 {
-  // __space == space_shared (due to parameter type constraint)
   // __sem == sem_relaxed (due to parameter type constraint)
   static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   _CUDA_VSTD::uint64_t __state;
   _CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -141,14 +141,14 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(

 /*
 // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
 // .sem = { .relaxed }
 // .scope = { .cluster }
+// .space = { .shared::cluster }
 template <typename = void>
 __device__ static inline void mbarrier_arrive_expect_tx(
-  cuda::ptx::space_cluster_t,
   cuda::ptx::sem_relaxed_t,
   cuda::ptx::scope_cluster_t,
+  cuda::ptx::space_cluster_t,
   uint64_t* addr,
   const uint32_t& txCount);
 */
@@ -156,11 +156,11 @@ __device__ static inline void mbarrier_arrive_expect_tx(
 extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
 template <typename = void>
 _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(
-  space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
+  sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
 {
-// __space == space_cluster (due to parameter type constraint)
 // __sem == sem_relaxed (due to parameter type constraint)
 // __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
 #  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
   asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
       :
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
index 930cfa09125..4a9e6da279b 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h
@@ -159,7 +159,7 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
           const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor_tile_gather4));));
 #endif // __cccl_ptx_isa >= 860

-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
   NV_IF_TARGET(
     NV_HAS_FEATURE_SM_100a,
     (
@@ -176,5 +176,5 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
         static_cast<void (*)(
           cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t (&)[5], const void*)>(
           cuda::ptx::cp_async_bulk_tensor_tile_scatter4));));
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
index d32773c118d..1c9484f7104 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h
@@ -95,13 +95,13 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)

       // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));

       // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+        static_cast<uint64_t (*)(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
@@ -113,12 +113,12 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));

       // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
@@ -129,7 +129,7 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr], count;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860

@@ -140,7 +140,7 @@ __global__ void test_mbarrier_arrive(void** fn_ptr)
       // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr];
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*)>(
           cuda::ptx::mbarrier_arrive));));
 #endif // __cccl_ptx_isa >= 860
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
index 8ef925662ac..99012490516 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h
@@ -52,13 +52,13 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr)

       // mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 state, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));

       // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 state, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<uint64_t (*)(
-          cuda::ptx::space_shared_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+        static_cast<uint64_t (*)(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));));
 #endif // __cccl_ptx_isa >= 860
@@ -70,7 +70,7 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr)
       // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [addr], txCount;
       * fn_ptr++ = reinterpret_cast<void*>(
         static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&)>(
+          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>(
           cuda::ptx::mbarrier_arrive_expect_tx));));
 #endif // __cccl_ptx_isa >= 860
 }
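
Note (not part of the patch): the test updates above pin one overload out of the cuda::ptx::mbarrier_arrive overload set with a function-pointer cast, so the build fails if the parameter order ever drifts from what is documented. A reduced sketch of that technique:

    #include <cuda/ptx>
    #include <cstdint>

    __global__ void check_arrive_overload(void** fn_ptr)
    {
      // The static_cast selects the (.relaxed, .cta, .shared::cta) overload and
      // only compiles if an overload with exactly this signature exists.
      *fn_ptr++ = reinterpret_cast<void*>(
        static_cast<uint64_t (*)(
          cuda::ptx::sem_relaxed_t, cuda::ptx::scope_cta_t, cuda::ptx::space_shared_t, uint64_t*, const uint32_t&)>(
          cuda::ptx::mbarrier_arrive));
    }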