Backport to 2.8: PTX: fix cp.async.bulk.tensor and mbarrier.arrive (#3628) #3630
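
This backport brings #3628 to the 2.8 release branch. It fixes two issues in the generated PTX bindings: the cp.async.bulk.tensor tile::scatter4 docs and guards claimed availability from PTX ISA 8.0, while, like the adjacent tile::gather4 bindings, the instruction requires PTX ISA 8.6; and the relaxed mbarrier.arrive / mbarrier.arrive.expect_tx overloads listed their tag parameters in the order (space, sem, scope) rather than the (sem, scope, space) order used by the other overloads.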

Merged
@@ -112,7 +112,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

-// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a
+// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86, SM_100a, SM_101a
// .dst = { .global }
// .src = { .shared::cta }
template <typename = void>
24 changes: 12 additions & 12 deletions docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst
@@ -118,14 +118,14 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& count);

@@ -134,14 +134,14 @@ mbarrier.arrive.relaxed.cluster.shared::cta.b64
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& count);

@@ -150,44 +150,44 @@ mbarrier.arrive.relaxed.cta.shared::cta.b64
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr);

mbarrier.arrive.relaxed.cluster.shared::cta.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr);

mbarrier.arrive.relaxed.cluster.shared::cluster.b64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr,
const uint32_t& count);

@@ -196,12 +196,12 @@ mbarrier.arrive.relaxed.cluster.shared::cluster.b64
.. code:: cuda

// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr);
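
For downstream users updating call sites against 2.8, a minimal sketch of the corrected argument order; this is an illustrative kernel (names like `kernel` and `bar` are placeholders), assuming `cuda::ptx::mbarrier_init` and the usual `cuda::ptx` tag objects, compiled for SM_90 or newer with PTX ISA 8.6:

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel()
   {
     __shared__ uint64_t bar;
     if (threadIdx.x == 0)
     {
       cuda::ptx::mbarrier_init(&bar, blockDim.x); // expect one arrival per thread
     }
     __syncthreads();

     // Fixed order: semantics first, then scope, then state space.
     uint64_t state =
       cuda::ptx::mbarrier_arrive(cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar);
     (void) state; // the returned arrival token is unused in this sketch
   }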
@@ -54,14 +54,14 @@ mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64
.. code:: cuda

// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive_expect_tx(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& txCount);

@@ -70,14 +70,14 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64
.. code:: cuda

// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive_expect_tx(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& txCount);

@@ -86,13 +86,13 @@ mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64
.. code:: cuda

// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive_expect_tx(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr,
const uint32_t& txCount);
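
Likewise for the expect_tx form, a hedged call-site sketch (placeholder names; the transaction count of 64 bytes is arbitrary), again assuming SM_90 or newer and PTX ISA 8.6:

.. code:: cuda

   #include <cuda/ptx>

   __global__ void expect_tx_kernel()
   {
     __shared__ uint64_t bar;
     if (threadIdx.x == 0)
     {
       cuda::ptx::mbarrier_init(&bar, 1);
       // Fixed order: (sem, scope, space, addr, txCount).
       uint64_t state = cuda::ptx::mbarrier_arrive_expect_tx(
         cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, cuda::ptx::space_shared, &bar, 64);
       (void) state;
     }
     __syncthreads();
   }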
@@ -243,7 +243,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
#endif // __cccl_ptx_isa >= 860

/*
-// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80,
+// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 86,
SM_100a, SM_101a
// .dst = { .global }
// .src = { .shared::cta }
@@ -255,7 +255,7 @@ __device__ static inline void cp_async_bulk_tensor_tile_scatter4(
const int32_t (&tensorCoords)[5],
const void* srcMem);
*/
-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
template <typename = void>
_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
@@ -283,6 +283,6 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4(
__cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__();
# endif
}
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860

#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_
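
Since the guard moved from 8.0 to 8.6, call sites should gate on the higher ISA version. A sketch under stated assumptions: `tensor_map` is a valid 2D TMA descriptor created on the host, the kernel targets SM_100a/SM_101a, the coordinate values are illustrative (their meaning is defined by the PTX ISA for tile::scatter4), and completion tracking via the bulk-group commit/wait instructions is omitted. Gating on `__cccl_ptx_isa` mirrors the library's own internal guards:

.. code:: cuda

   #include <cuda/ptx>

   __global__ void scatter4_kernel(const void* tensor_map)
   {
   #if __cccl_ptx_isa >= 860 // was >= 800 before this fix
     __shared__ alignas(128) int smem_rows[4 * 64]; // four rows staged in shared memory
     const int32_t coords[5] = {0, 0, 1, 2, 3};     // tensor coordinates per the PTX ISA spec
     cuda::ptx::cp_async_bulk_tensor_tile_scatter4(
       cuda::ptx::space_global, cuda::ptx::space_shared, tensor_map, coords, smem_rows);
   #endif
   }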
@@ -223,30 +223,30 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

/*
// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& count);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
template <dot_scope _Scope>
_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(
-space_shared_t,
sem_relaxed_t,
scope_t<_Scope> __scope,
+space_shared_t,
_CUDA_VSTD::uint64_t* __addr,
const _CUDA_VSTD::uint32_t& __count)
{
-// __space == space_shared (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
_CUDA_VSTD::uint64_t __state;
_CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -274,25 +274,25 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(

/*
// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
template <dot_scope _Scope>
_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t
-mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr)
{
-// __space == space_shared (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
_CUDA_VSTD::uint64_t __state;
_CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -320,26 +320,26 @@ mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VS

/*
// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr,
const uint32_t& count);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void mbarrier_arrive(
-space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
+sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
{
-// __space == space_cluster (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
// __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
:
@@ -354,25 +354,25 @@ _CCCL_DEVICE static inline void mbarrier_arrive(

/*
// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void
-mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr)
+mbarrier_arrive(sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr)
{
-// __space == space_cluster (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
// __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory");
# else
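
Note that the reorder only touches the C++ parameter list; the emitted PTX is unchanged, as in this before/after sketch (assumed names, barrier initialization and cluster launch setup omitted):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void before_after()
   {
     __shared__ uint64_t bar; // placeholder; assume it was mbarrier_init'ed earlier
     uint32_t n = 1;
     // Before this fix the overload was spelled:
     //   cuda::ptx::mbarrier_arrive(cuda::ptx::space_cluster, cuda::ptx::sem_relaxed,
     //                              cuda::ptx::scope_cluster, &bar, n);
     // After the fix:
     cuda::ptx::mbarrier_arrive(
       cuda::ptx::sem_relaxed, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, &bar, n);
     // Both spellings lower to the same instruction:
     //   mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [bar], n;
   }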
@@ -90,30 +90,30 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(

/*
// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cta }
// .sem = { .relaxed }
// .scope = { .cta, .cluster }
+// .space = { .shared::cta }
template <cuda::ptx::dot_scope Scope>
__device__ static inline uint64_t mbarrier_arrive_expect_tx(
-cuda::ptx::space_shared_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_t<Scope> scope,
+cuda::ptx::space_shared_t,
uint64_t* addr,
const uint32_t& txCount);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
template <dot_scope _Scope>
_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(
-space_shared_t,
sem_relaxed_t,
scope_t<_Scope> __scope,
+space_shared_t,
_CUDA_VSTD::uint64_t* __addr,
const _CUDA_VSTD::uint32_t& __txCount)
{
-// __space == space_shared (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
static_assert(__scope == scope_cta || __scope == scope_cluster, "");
+// __space == space_shared (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
_CUDA_VSTD::uint64_t __state;
_CCCL_IF_CONSTEXPR (__scope == scope_cta)
@@ -141,26 +141,26 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx(

/*
// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90
-// .space = { .shared::cluster }
// .sem = { .relaxed }
// .scope = { .cluster }
+// .space = { .shared::cluster }
template <typename = void>
__device__ static inline void mbarrier_arrive_expect_tx(
-cuda::ptx::space_cluster_t,
cuda::ptx::sem_relaxed_t,
cuda::ptx::scope_cluster_t,
+cuda::ptx::space_cluster_t,
uint64_t* addr,
const uint32_t& txCount);
*/
#if __cccl_ptx_isa >= 860
extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx(
-space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
+sem_relaxed_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount)
{
-// __space == space_cluster (due to parameter type constraint)
// __sem == sem_relaxed (due to parameter type constraint)
// __scope == scope_cluster (due to parameter type constraint)
+// __space == space_cluster (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;"
:
@@ -159,7 +159,7 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor_tile_gather4));));
#endif // __cccl_ptx_isa >= 860

-#if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 860
NV_IF_TARGET(
NV_HAS_FEATURE_SM_100a,
(
@@ -176,5 +176,5 @@ __global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr)
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
cuda::ptx::cp_async_bulk_tensor_tile_scatter4));));
-#endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 860
}