NVIDIA · bernhardmgruber · Nov 22, 2024 · Nov 21, 2024 · Nov 22, 2024
@@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
-/*
-// barrier.cluster.arrive; // PTX ISA 78, SM_90
-// Marked volatile and as clobbering memory
-template <typename=void>
-__device__ static inline void barrier_cluster_arrive();
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.wait; // PTX ISA 78, SM_90
-// Marked volatile and as clobbering memory
-template <typename=void>
-__device__ static inline void barrier_cluster_wait();
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
-// .sem       = { .release }
-// Marked volatile and as clobbering memory
-template <typename=void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_release_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t)
-{
-  // __sem == sem_release (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.release;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
-// .sem       = { .relaxed }
-// Marked volatile
-template <typename=void>
-__device__ static inline void barrier_cluster_arrive(
-  cuda::ptx::sem_relaxed_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t)
-{
-  // __sem == sem_relaxed (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive.relaxed;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
-// .sem       = { .acquire }
-// Marked volatile and as clobbering memory
-template <typename=void>
-__device__ static inline void barrier_cluster_wait(
-  cuda::ptx::sem_acquire_t);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t)
-{
-  // __sem == sem_acquire (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait.acquire;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
+#include <cuda/__ptx/instructions/generated/barrier_cluster.inc>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 

@@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
-/*
-// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80,
-SM_90
-// .dst       = { .shared::cluster }
-// .src       = { .global }
-template <typename=void>
-__device__ static inline void cp_async_bulk(
-  cuda::ptx::space_cluster_t,
-  cuda::ptx::space_global_t,
-  void* dstMem,
-  const void* srcMem,
-  const uint32_t& size,
-  uint64_t* smem_bar);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void cp_async_bulk(
-  space_cluster_t,
-  space_global_t,
-  void* __dstMem,
-  const void* __srcMem,
-  const _CUDA_VSTD::uint32_t& __size,
-  _CUDA_VSTD::uint64_t* __smem_bar)
-{
-  // __space == space_cluster (due to parameter type constraint)
-  // __space == space_global (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast"
-         :
-         : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.  PTX ISA 80, SM_90
-// .dst       = { .shared::cluster }
-// .src       = { .shared::cta }
-template <typename=void>
-__device__ static inline void cp_async_bulk(
-  cuda::ptx::space_cluster_t,
-  cuda::ptx::space_shared_t,
-  void* dstMem,
-  const void* srcMem,
-  const uint32_t& size,
-  uint64_t* rdsmem_bar);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void cp_async_bulk(
-  space_cluster_t,
-  space_shared_t,
-  void* __dstMem,
-  const void* __srcMem,
-  const _CUDA_VSTD::uint32_t& __size,
-  _CUDA_VSTD::uint64_t* __rdsmem_bar)
-{
-  // __space == space_cluster (due to parameter type constraint)
-  // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. "
-         :
-         : "r"(__as_ptr_remote_dsmem(__dstMem)),
-           "r"(__as_ptr_smem(__srcMem)),
-           "r"(__size),
-           "r"(__as_ptr_remote_dsmem(__rdsmem_bar))
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
-
-/*
-// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3.  PTX ISA 80, SM_90
-// .dst       = { .global }
-// .src       = { .shared::cta }
-template <typename=void>
-__device__ static inline void cp_async_bulk(
-  cuda::ptx::space_global_t,
-  cuda::ptx::space_shared_t,
-  void* dstMem,
-  const void* srcMem,
-  const uint32_t& size);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void
-cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size)
-{
-  // __space == space_global (due to parameter type constraint)
-  // __space == space_shared (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. "
-         :
-         : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
-/*
-// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar],
-ctaMask; // 1.  PTX ISA 80, SM_90a
-// .dst       = { .shared::cluster }
-// .src       = { .global }
-template <typename=void>
-__device__ static inline void cp_async_bulk(
-  cuda::ptx::space_cluster_t,
-  cuda::ptx::space_global_t,
-  void* dstMem,
-  const void* srcMem,
-  const uint32_t& size,
-  uint64_t* smem_bar,
-  const uint16_t& ctaMask);
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();
-template <typename = void>
-_CCCL_DEVICE static inline void cp_async_bulk(
-  space_cluster_t,
-  space_global_t,
-  void* __dstMem,
-  const void* __srcMem,
-  const _CUDA_VSTD::uint32_t& __size,
-  _CUDA_VSTD::uint64_t* __smem_bar,
-  const _CUDA_VSTD::uint16_t& __ctaMask)
-{
-  // __space == space_cluster (due to parameter type constraint)
-  // __space == space_global (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_HAS_FEATURE_SM_90a,
-    (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], "
-         "%4; // 1. "
-         :
-         : "r"(__as_ptr_smem(__dstMem)),
-           "l"(__as_ptr_gmem(__srcMem)),
-           "r"(__size),
-           "r"(__as_ptr_smem(__smem_bar)),
-           "h"(__ctaMask)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();));
-}
-#endif // __cccl_ptx_isa >= 800
+#include <cuda/__ptx/instructions/generated/cp_async_bulk.inc>
+#include <cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 

@@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 
 // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
-/*
-// cp.async.bulk.commit_group; // PTX ISA 80, SM_90
-template <typename=void>
-__device__ static inline void cp_async_bulk_commit_group();
-*/
-#if __cccl_ptx_isa >= 800
-extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline void cp_async_bulk_commit_group()
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("cp.async.bulk.commit_group;"
-                  :
-                  :
-                  :);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();));
-}
-#endif // __cccl_ptx_isa >= 800
+#include <cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX