PTX: Update generated files with Blackwell instructions (#3568)
* ptx: Update existing instructions
* ptx: Add new instructions
* Fix returning error out values
See:
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73
* ptx: Fix out var declaration
See  https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75
* mbarrier.{test,try}_wait: Fix test. Wrong files were included.
* docs: Fix special registers include
* Allow non-included documentation pages
* Workaround NVRTC

Co-authored-by: Allard Hendriksen <[email protected]>
bernhardmgruber and ahendriksen authored Jan 29, 2025
1 parent ced506d commit d21e0c9
Showing 155 changed files with 58,115 additions and 2,683 deletions.
@@ -0,0 +1,63 @@
..
   This file was automatically generated. Do not edit.

barrier.cluster.arrive.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::dot_aligned_t);

barrier.cluster.wait.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_wait(
     cuda::ptx::dot_aligned_t);

barrier.cluster.arrive.release.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .release }
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::sem_release_t,
     cuda::ptx::dot_aligned_t);

barrier.cluster.arrive.relaxed.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .relaxed }
   // .aligned = { .aligned }
   // Marked volatile
   template <typename = void>
   __device__ static inline void barrier_cluster_arrive(
     cuda::ptx::sem_relaxed_t,
     cuda::ptx::dot_aligned_t);

barrier.cluster.wait.acquire.aligned
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90
   // .sem = { .acquire }
   // .aligned = { .aligned }
   // Marked volatile and as clobbering memory
   template <typename = void>
   __device__ static inline void barrier_cluster_wait(
     cuda::ptx::sem_acquire_t,
     cuda::ptx::dot_aligned_t);
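For orientation, a minimal usage sketch of the release/acquire pair (not part of the generated listing; it assumes a kernel launched as a thread-block cluster and the tag constants ``cuda::ptx::sem_release``, ``cuda::ptx::sem_acquire``, and ``cuda::ptx::dot_aligned`` exposed by ``<cuda/ptx>``):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void cluster_kernel()
   {
     // ... this CTA writes its shared memory for peer CTAs to read ...

     // Signal arrival and release prior writes; .aligned requires that all
     // threads of the CTA execute the same instruction.
     cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release, cuda::ptx::dot_aligned);

     // Block until all CTAs of the cluster have arrived, acquiring their writes.
     cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire, cuda::ptx::dot_aligned);

     // ... safe to read peer CTAs' shared memory here ...
   }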
@@ -0,0 +1,68 @@
..
   This file was automatically generated. Do not edit.

clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100
   template <typename = void>
   __device__ static inline void clusterlaunchcontrol_try_cancel(
     void* addr,
     uint64_t* smem_bar);

clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a
   template <typename = void>
   __device__ static inline void clusterlaunchcontrol_try_cancel_multicast(
     void* addr,
     uint64_t* smem_bar);

clusterlaunchcontrol.query_cancel.is_canceled.pred.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(
     B128 try_cancel_response);

clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100
   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
   __device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid(
     B32 (&block_dim)[4],
     B128 try_cancel_response);
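Taken together, these form a cancellation handshake: request asynchronously, then decode the 16-byte response. A hedged sketch (illustrative only; the mbarrier init/expect-tx/wait protocol is elided, and ``longlong2`` stands in for any 16-byte ``B128`` type):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel()
   {
     __shared__ alignas(16) longlong2 response; // 16-byte try_cancel response
     __shared__ uint64_t bar;                   // mbarrier (setup elided)

     if (threadIdx.x == 0) {
       // Ask the launch controller to cancel a not-yet-launched cluster; the
       // response is delivered asynchronously through the mbarrier.
       cuda::ptx::clusterlaunchcontrol_try_cancel(&response, &bar);
     }
     // ... wait on the mbarrier until the response has arrived ...

     if (cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled(response)) {
       // Read the first CTA id of the canceled cluster, per coordinate ...
       uint32_t x = cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x<uint32_t>(response);
       // ... or all four components at once.
       uint32_t dims[4];
       cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid(dims, response);
     }
   }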
38 changes: 35 additions & 3 deletions docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90
   // .dst = { .shared::cluster }
   // .src = { .global }
   template <typename = void>
@@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
     const uint32_t& size,
     uint64_t* smem_bar);

cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90
   // .dst = { .shared::cta }
   // .src = { .global }
   template <typename = void>
   __device__ static inline void cp_async_bulk(
     cuda::ptx::space_shared_t,
     cuda::ptx::space_global_t,
     void* dstMem,
     const void* srcMem,
     const uint32_t& size,
     uint64_t* smem_bar);
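A hedged usage sketch of this new ``.shared::cta`` overload (illustrative: mbarrier setup and the expect-tx/wait protocol are elided, and the tag constants ``cuda::ptx::space_shared`` and ``cuda::ptx::space_global`` from ``<cuda/ptx>`` are assumed):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel(const float* gmem_src)
   {
     __shared__ alignas(16) float smem_buf[256];
     __shared__ uint64_t bar; // mbarrier tracking the transfer (setup elided)

     if (threadIdx.x == 0) {
       // Bulk-copy 1 KiB from global memory into this CTA's shared memory;
       // completion is signaled via the mbarrier's complete_tx mechanism.
       cuda::ptx::cp_async_bulk(
         cuda::ptx::space_shared, cuda::ptx::space_global,
         smem_buf, gmem_src, sizeof(smem_buf), &bar);
     }
     // ... arrive on and wait for the mbarrier before reading smem_buf ...
   }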
cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90
   // .dst = { .shared::cluster }
   // .src = { .shared::cta }
   template <typename = void>
@@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
+   // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90
   // .dst = { .global }
   // .src = { .shared::cta }
   template <typename = void>
@@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group
     void* dstMem,
     const void* srcMem,
     const uint32_t& size);

cp.async.bulk.global.shared::cta.bulk_group.cp_mask
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

   // cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100
   // .dst = { .global }
   // .src = { .shared::cta }
   template <typename = void>
   __device__ static inline void cp_async_bulk_cp_mask(
     cuda::ptx::space_global_t,
     cuda::ptx::space_shared_t,
     void* dstMem,
     const void* srcMem,
     const uint32_t& size,
     const uint16_t& byteMask);
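And a sketch of the masked write-back variant (illustrative, SM_100 only; the interpretation of ``byteMask`` as selecting bytes within each 16-byte chunk is an assumption to check against the PTX ISA documentation):

.. code:: cuda

   #include <cuda/ptx>

   __global__ void kernel(char* gmem_dst)
   {
     __shared__ alignas(16) char smem_buf[128];
     // ... produce smem_buf, then synchronize ...
     __syncthreads();

     if (threadIdx.x == 0) {
       // Copy shared memory to global memory, writing only the bytes selected
       // by byteMask (assumed: bit i enables byte i of each 16-byte chunk).
       cuda::ptx::cp_async_bulk_cp_mask(
         cuda::ptx::space_global, cuda::ptx::space_shared,
         gmem_dst, smem_buf, sizeof(smem_buf), uint16_t{0x00FF});

       // Bulk-group bookkeeping: commit, then wait until the reads from
       // smem_buf have completed before the buffer may be overwritten.
       cuda::ptx::cp_async_bulk_commit_group();
       cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
     }
   }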
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: cuda

-   // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
+   // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a
   // .dst = { .shared::cluster }
   // .src = { .global }
   template <typename = void>
