PTX: Update generated files with Blackwell instructions (#3568)
* ptx: Update existing instructions
* ptx: Add new instructions
* Fix returning error out values
See:
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73
* ptx: Fix out var declaration
See  https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75
* mbarrier.{test,try}_wait: Fix test. Wrong files were included.
* docs: Fix special registers include
* Allow non-included documentation pages
* Workaround NVRTC

Co-authored-by: Allard Hendriksen <[email protected]>
bernhardmgruber and ahendriksen authored Jan 29, 2025
1 parent ced506d commit d21e0c9
Showing 155 changed files with 58,115 additions and 2,683 deletions.
@@ -0,0 +1,63 @@
..
   This file was automatically generated. Do not edit.

barrier.cluster.arrive.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::dot_aligned_t);

barrier.cluster.wait.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_wait(
     cuda::ptx::dot_aligned_t);

barrier.cluster.arrive.release.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .release }
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::sem_release_t,
     cuda::ptx::dot_aligned_t);

barrier.cluster.arrive.relaxed.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .relaxed }
   // .aligned = { .aligned }
   // Marked volatile
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::sem_relaxed_t,
     cuda::ptx::dot_aligned_t);

barrier.cluster.wait.acquire.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .acquire }
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_wait(
     cuda::ptx::sem_acquire_t,
     cuda::ptx::dot_aligned_t);
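For orientation, a minimal usage sketch of the release/acquire pair (not part of the generated listing; it assumes a kernel launched as a thread-block cluster and the tag constants ``cuda::ptx::sem_release``, ``cuda::ptx::sem_acquire``, and ``cuda::ptx::dot_aligned`` exposed by ``<cuda/ptx>``):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void cluster_kernel()
   {
     // ... this CTA writes its shared memory for peer CTAs to read ...

     // Signal arrival and release prior writes; .aligned requires that all
     // threads of the CTA execute the same instruction.
     cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release, cuda::ptx::dot_aligned);

     // Block until all CTAs of the cluster have arrived, acquiring their writes.
     cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire, cuda::ptx::dot_aligned);

     // ... safe to read peer CTAs' shared memory here ...
   }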
@@ -0,0 +1,68 @@
..
   This file was automatically generated. Do not edit.

clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100
   template <typename = void>
   __device__ static inline void clusterlaunchcontrol_try_cancel(
     void* addr,
     uint64_t* smem_bar);

clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a
   template <typename = void>
   __device__ static inline void clusterlaunchcontrol_try_cancel_multicast(
     void* addr,
     uint64_t* smem_bar);

clusterlaunchcontrol.query_cancel.is_canceled.pred.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid(
     B32 (&block_dim)[4],
     B128 try_cancel_response);
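Taken together, these form a cancellation handshake: request asynchronously, then decode the 16-byte response. A hedged sketch (illustrative only; the mbarrier init/expect-tx/wait protocol is elided, and ``longlong2`` stands in for any 16-byte ``B128`` type):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel()
   {
     __shared__ alignas(16) longlong2 response; // 16-byte try_cancel response
     __shared__ uint64_t bar;                   // mbarrier (setup elided)

     if (threadIdx.x == 0) {
       // Ask the launch controller to cancel a not-yet-launched cluster; the
       // response is delivered asynchronously through the mbarrier.
       cuda::ptx::clusterlaunchcontrol_try_cancel(&response, &bar);
     }
     // ... wait on the mbarrier until the response has arrived ...

     if (cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled(response)) {
       // Read the first CTA id of the canceled cluster, per coordinate ...
       uint32_t x = cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x<uint32_t>(response);
       // ... or all four components at once.
       uint32_t dims[4];
       cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid(dims, response);
     }
   }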
38 changes: 35 additions & 3 deletions docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90
   // .dst = { .shared::cluster }
   // .src = { .global }
   template <typename = void>
@@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
     const uint32_t& size,
     uint64_t* smem_bar);

cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90
   // .dst = { .shared::cta }
   // .src = { .global }
   template <typename = void>
   __device__ static inline void cp_async_bulk(
     cuda::ptx::space_shared_t,
     cuda::ptx::space_global_t,
     void* dstMem,
     const void* srcMem,
     const uint32_t& size,
     uint64_t* smem_bar);
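A hedged usage sketch of this new ``.shared::cta`` overload (illustrative: mbarrier setup and the expect-tx/wait protocol are elided, and the tag constants ``cuda::ptx::space_shared`` and ``cuda::ptx::space_global`` from ``<cuda/ptx>`` are assumed):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel(const float* gmem_src)
   {
     __shared__ alignas(16) float smem_buf[256];
     __shared__ uint64_t bar; // mbarrier tracking the transfer (setup elided)

     if (threadIdx.x == 0) {
       // Bulk-copy 1 KiB from global memory into this CTA's shared memory;
       // completion is signaled via the mbarrier's complete_tx mechanism.
       cuda::ptx::cp_async_bulk(
         cuda::ptx::space_shared, cuda::ptx::space_global,
         smem_buf, gmem_src, sizeof(smem_buf), &bar);
     }
     // ... arrive on and wait for the mbarrier before reading smem_buf ...
   }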
cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90
   // .dst = { .shared::cluster }
   // .src = { .shared::cta }
   template <typename = void>
@@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90
   // .dst = { .global }
   // .src = { .shared::cta }
   template <typename = void>
@@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group
     void* dstMem,
     const void* srcMem,
     const uint32_t& size);

cp.async.bulk.global.shared::cta.bulk_group.cp_mask
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100
   // .dst = { .global }
   // .src = { .shared::cta }
   template <typename = void>
   __device__ static inline void cp_async_bulk_cp_mask(
     cuda::ptx::space_global_t,
     cuda::ptx::space_shared_t,
     void* dstMem,
     const void* srcMem,
     const uint32_t& size,
     const uint16_t& byteMask);
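And a sketch of the masked write-back variant (illustrative, SM_100 only; the interpretation of ``byteMask`` as selecting bytes within each 16-byte chunk is an assumption to check against the PTX ISA documentation):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel(char* gmem_dst)
   {
     __shared__ alignas(16) char smem_buf[128];
     // ... produce smem_buf, then synchronize ...
     __syncthreads();

     if (threadIdx.x == 0) {
       // Copy shared memory to global memory, writing only the bytes selected
       // by byteMask (assumed: bit i enables byte i of each 16-byte chunk).
       cuda::ptx::cp_async_bulk_cp_mask(
         cuda::ptx::space_global, cuda::ptx::space_shared,
         gmem_dst, smem_buf, sizeof(smem_buf), uint16_t{0x00FF});

       // Bulk-group bookkeeping: commit, then wait until the reads from
       // smem_buf have completed before the buffer may be overwritten.
       cuda::ptx::cp_async_bulk_commit_group();
       cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
     }
   }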
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a
   // .dst = { .shared::cluster }
   // .src = { .global }
   template <typename = void>
