diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index a518dad0ff2..f0776974eec 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -6,25 +6,25 @@ PTX Instructions .. toctree:: :maxdepth: 1 - instructions/barrier.cluster - instructions/cp.async.bulk - instructions/cp.async.bulk.commit_group - instructions/cp.async.bulk.wait_group - instructions/cp.async.bulk.tensor - instructions/cp.reduce.async.bulk - instructions/cp.reduce.async.bulk.tensor + instructions/barrier_cluster + instructions/cp_async_bulk + instructions/cp_async_bulk_commit_group + instructions/cp_async_bulk_wait_group + instructions/cp_async_bulk_tensor + instructions/cp_reduce_async_bulk + instructions/cp_reduce_async_bulk_tensor instructions/fence instructions/getctarank instructions/mapa - instructions/mbarrier.init - instructions/mbarrier.arrive - instructions/mbarrier.expect_tx - instructions/mbarrier.test_wait - instructions/mbarrier.try_wait - instructions/red.async - instructions/st.async - instructions/tensormap.replace - instructions/tensormap.cp_fenceproxy + instructions/mbarrier_init + instructions/mbarrier_arrive + instructions/mbarrier_expect_tx + instructions/mbarrier_test_wait + instructions/mbarrier_try_wait + instructions/red_async + instructions/st_async + instructions/tensormap_replace + instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst new file mode 100644 index 00000000000..bc8943bc619 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst @@ -0,0 +1,16 @@ +.. 
_libcudacxx-ptx-instructions-barrier-cluster: + +barrier.cluster +=============== + +- PTX ISA: + `barrier.cluster `__ + +Similar functionality is provided through the builtins +``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, +as well as the ``cooperative_groups::cluster_group`` +`API `__. + +The ``.aligned`` variants of the instructions are not exposed. + +.. include:: generated/barrier_cluster.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst new file mode 100644 index 00000000000..32121ef8a12 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst @@ -0,0 +1,30 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk: + +cp.async.bulk +============= + +- PTX ISA: + `cp.async.bulk `__ + +Implementation notes +-------------------- + +**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and +``size`` must be a multiple of 16. + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_multicast`` was enabled for + SM_90. This has been changed to SM_90a. + + +Unicast +------- + +.. include:: generated/cp_async_bulk.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst similarity index 58% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst rename to docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst index cc549f54163..8efc5ac0488 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst @@ -6,10 +6,4 @@ cp.async.bulk.commit_group - PTX ISA: `cp.async.bulk.commit_group `__ -cp.async.bulk.commit_group -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template - __device__ static inline void cp_async_bulk_commit_group(); +.. include:: generated/cp_async_bulk_commit_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst new file mode 100644 index 00000000000..bde3488bac9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -0,0 +1,23 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: + +cp.async.bulk.tensor +==================== + +- PTX ISA: + `cp.async.bulk.tensor `__ + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled + for SM_90. This has been changed to SM_90a. + +Unicast +------- + +.. include:: generated/cp_async_bulk_tensor.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_tensor_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst new file mode 100644 index 00000000000..e24bb0fc9fd --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: + +cp.async.bulk.wait_group +======================== + +- PTX ISA: + `cp.async.bulk.wait_group `__ + +.. include:: generated/cp_async_bulk_wait_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst new file mode 100644 index 00000000000..a4710b5ce30 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst @@ -0,0 +1,61 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: + +cp.reduce.async.bulk +==================== + +- PTX ISA: + `cp.reduce.async.bulk `__ + + +Integer and floating point instructions +--------------------------------------- + +.. 
include:: generated/cp_reduce_async_bulk.rst + +Emulation of ``.s64`` instruction +--------------------------------- + +PTX does not currently (CTK 12.3) expose +``cp.reduce.async.bulk.add.s64``. This exposure is emulated in +``cuda::ptx`` using: + +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 + // .dst = { .shared::cluster } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); + + // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); + +FP16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_f16.rst + +BF16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_bf16.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst new file mode 100644 index 00000000000..598d9e1e3ea --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: + +cp.reduce.async.bulk.tensor +=========================== + +- PTX ISA: + `cp.reduce.async.bulk.tensor `__ + +.. 
include:: generated/cp_reduce_async_bulk_tensor.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 8a4e7f281cb..82de170f63b 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -11,272 +11,25 @@ fence fence ----- -fence.sc.cta -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.gpu -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.sys -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.cta -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.gpu -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.sys -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.cluster -^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); - -fence.acq_rel.cluster -^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence.rst fence.mbarrier_init ------------------- -fence.mbarrier_init.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - template - __device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence_mbarrier_init.rst fence.proxy.alias ----------------- -fence.proxy.alias -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template - __device__ static inline void fence_proxy_alias(); +.. include:: generated/fence_proxy_alias.rst fence.proxy.async ----------------- -fence.proxy.async -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template - __device__ static inline void fence_proxy_async(); -fence.proxy.async.global -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. 
PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); +.. include:: generated/fence_proxy_async.rst fence.proxy.tensormap --------------------- -fence.proxy.tensormap::generic.release.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.acquire.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); +.. 
include:: generated/fence_proxy_tensormap_generic.rst diff --git a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst similarity index 70% rename from docs/libcudacxx/ptx/instructions/barrier.cluster.rst rename to docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index 99048587eb5..bd994990c05 100644 --- a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,18 +1,3 @@ -.. _libcudacxx-ptx-instructions-barrier-cluster: - -barrier.cluster -=============== - -- PTX ISA: - `barrier.cluster `__ - -Similar functionality is provided through the builtins -``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, -as well as the ``cooperative_groups::cluster_group`` -`API `__. - -The ``.aligned`` variants of the instructions are not exposed. - barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst similarity index 57% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 434a44a15a4..f5c236f8bf9 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,26 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk: - -cp.async.bulk -============= - -- PTX ISA: - `cp.async.bulk `__ - -Implementation notes --------------------- - -**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and -``size`` must be a multiple of 16. - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_multicast`` was enabled for - SM_90. This has been changed to SM_90a. 
- -Unicast -------- - cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -67,23 +44,3 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); - -Multicast ---------- - -cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst new file mode 100644 index 00000000000..984b4aff976 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -0,0 +1,7 @@ +cp.async.bulk.commit_group +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 + template + __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst new file mode 100644 index 00000000000..9cb15d06fa3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -0,0 +1,16 @@ +cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst similarity index 59% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index f095abcd1a3..40eb070e66a 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,20 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: - -cp.async.bulk.tensor -==================== - -- PTX ISA: - `cp.async.bulk.tensor `__ - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled - for SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -169,91 +152,3 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group const void* tensorMap, const int32_t (&tensorCoords)[5], const void* srcMem); - -Multicast ---------- - -cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst new file mode 100644 index 00000000000..2481c80bf3c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -0,0 +1,84 @@ +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst similarity index 62% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 8939292d340..08ebd3c28a7 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: - -cp.async.bulk.wait_group -======================== - -- PTX ISA: - `cp.async.bulk.wait_group `__ - cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst similarity index 80% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index 571e1d9842f..cc82d633375 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,15 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: - -cp.reduce.async.bulk -==================== - -- PTX ISA: - `cp.reduce.async.bulk `__ - - -Integer and floating point instructions ---------------------------------------- - cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -652,155 +640,3 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 int64_t* dstMem, const int64_t* srcMem, uint32_t size); - -Emulation of ``.s64`` instruction ---------------------------------- - -PTX does not currently (CTK 12.3) expose -``cp.reduce.async.bulk.add.s64``. This exposure is emulated in -``cuda::ptx`` using: - -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 - // .dst = { .shared::cluster } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); - - // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); - -FP16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -BF16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst new file mode 100644 index 00000000000..e4dea98a119 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst new file mode 100644 index 00000000000..18c5e0bfc60 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst similarity index 98% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index 7ea7b5675aa..c653b01cd60 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: - -cp.reduce.async.bulk.tensor -=========================== - -- PTX ISA: - `cp.reduce.async.bulk.tensor `__ - cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst new file mode 100644 index 00000000000..2fe14dcb3b2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -0,0 +1,95 @@ +fence.sc.cta +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.gpu +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. 
PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.sys +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); + +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. 
PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst new file mode 100644 index 00000000000..0f5298e3359 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -0,0 +1,11 @@ +fence.mbarrier_init.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + template + __device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst new file mode 100644 index 00000000000..935aab9b6df --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -0,0 +1,7 @@ +fence.proxy.alias +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.alias; // 4. PTX ISA 75, SM_70 + template + __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst new file mode 100644 index 00000000000..3e741a1f6c4 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -0,0 +1,37 @@ +fence.proxy.async +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async; // 5. PTX ISA 80, SM_90 + template + __device__ static inline void fence_proxy_async(); + +fence.proxy.async.global +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. 
PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst new file mode 100644 index 00000000000..db582971c3d --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -0,0 +1,103 @@ +fence.proxy.tensormap::generic.release.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.acquire.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. 
PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst new file mode 100644 index 00000000000..c85f52ee302 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -0,0 +1,10 @@ +getctarank.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst new file mode 100644 index 00000000000..92cd106cad9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -0,0 +1,111 @@ +mbarrier.arrive.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); + +mbarrier.arrive.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. 
PTX ISA 78, SM_90 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst new file mode 100644 index 00000000000..0087ae2f458 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -0,0 +1,47 @@ +mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst new file mode 100644 index 00000000000..b6d7edbbeee --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -0,0 +1,9 @@ +mbarrier.arrive.noComplete.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst similarity index 88% rename from docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index 9b40db58d0c..b87d6f62a23 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: - -mbarrier.expect_tx -================== - -- PTX ISA: - `mbarrier.expect_tx `__ - mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst similarity index 50% rename from docs/libcudacxx/ptx/instructions/mbarrier.init.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 8c7e65eeab6..3e529d86d78 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-init: - -mbarrier.init -============= - -- PTX ISA: - `mbarrier.arrive `__ - mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst new file mode 100644 index 00000000000..4cb241c7ca8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. 
PTX ISA 70, SM_80 + template + __device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst new file mode 100644 index 00000000000..e750c4a543f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.parity.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 + template + __device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst new file mode 100644 index 00000000000..ce648c66ee9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. 
PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst new file mode 100644 index 00000000000..3210dc0eab1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/red.async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/red.async.rst rename to docs/libcudacxx/ptx/instructions/generated/red_async.rst index 62599548a22..d6b9cf36549 100644 --- a/docs/libcudacxx/ptx/instructions/red.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-red-async: - -red.async -========= - -- PTX ISA: - `red.async `__ - -.. 
_red.async-1: - -red.async ---------- - red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -191,20 +178,3 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 int64_t* dest, const int64_t& value, int64_t* remote_bar); - -red.async ``.s64`` emulation ----------------------------- - -PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This -exposure is emulated in ``cuda::ptx`` using - -.. code:: cuda - - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 - // .op = { .add } - template - __device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst new file mode 100644 index 00000000000..aa1add84781 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst @@ -0,0 +1,383 @@ +tid.x +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_x(); + +tid.y +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_y(); + +tid.z +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_z(); + +ntid.x +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_x(); + +ntid.y +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_y(); + +ntid.z +^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_z(); + +laneid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%laneid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_laneid(); + +warpid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%warpid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_warpid(); + +nwarpid +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nwarpid(); + +ctaid.x +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_x(); + +ctaid.y +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_y(); + +ctaid.z +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_z(); + +nctaid.x +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_x(); + +nctaid.y +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_y(); + +nctaid.z +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_z(); + +smid +^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%smid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_smid(); + +nsmid +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nsmid(); + +gridid +^^^^^^ +.. 
code:: cuda + + // mov.u64 sreg_value, %%gridid; // PTX ISA 30 + template + __device__ static inline uint64_t get_sreg_gridid(); + +is_explicit_cluster +^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 + template + __device__ static inline bool get_sreg_is_explicit_cluster(); + +clusterid.x +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_x(); + +clusterid.y +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_y(); + +clusterid.z +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_z(); + +nclusterid.x +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_x(); + +nclusterid.y +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_y(); + +nclusterid.z +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_z(); + +cluster_ctaid.x +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); + +cluster_ctaid.y +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); + +cluster_ctaid.z +^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); + +cluster_nctaid.x +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); + +cluster_nctaid.y +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); + +cluster_nctaid.z +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); + +cluster_ctarank +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctarank(); + +cluster_nctarank +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctarank(); + +lanemask_eq +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_eq(); + +lanemask_le +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_le(); + +lanemask_lt +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_lt(); + +lanemask_ge +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_ge(); + +lanemask_gt +^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_gt(); + +clock +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock; // PTX ISA 10 + template + __device__ static inline uint32_t get_sreg_clock(); + +clock_hi +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 + template + __device__ static inline uint32_t get_sreg_clock_hi(); + +clock64 +^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 + template + __device__ static inline uint64_t get_sreg_clock64(); + +globaltimer +^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 + template + __device__ static inline uint64_t get_sreg_globaltimer(); + +globaltimer_lo +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_lo(); + +globaltimer_hi +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_hi(); + +total_smem_size +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_total_smem_size(); + +aggr_smem_size +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 + template + __device__ static inline uint32_t get_sreg_aggr_smem_size(); + +dynamic_smem_size +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_dynamic_smem_size(); + +current_graph_exec +^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 + template + __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/st.async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/st.async.rst rename to docs/libcudacxx/ptx/instructions/generated/st_async.rst index a2e1ebe46a6..c519ea57f70 100644 --- a/docs/libcudacxx/ptx/instructions/st.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-st-async: - -st.async -======== - -- PTX ISA: - `st.async `__ -- Used in: :ref:`How to use st.async ` - -**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For -instance, the ``addr`` supplied to the ``v2.b32`` variant must be -aligned to ``2 x 4 = 8`` bytes. - st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 1de158491a8..52fae102ad4 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: - -tensormap.cp_fenceproxy -======================= - -- PTX ISA: - `tensormap.cp_fenceproxy `__ - tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst similarity index 97% rename from docs/libcudacxx/ptx/instructions/tensormap.replace.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 7d8b839584e..33e6f1d839a 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-replace: - -tensormap.replace -================= - -- PTX ISA: - `tensormap.replace `__ - tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/getctarank.rst b/docs/libcudacxx/ptx/instructions/getctarank.rst index 5bad6259103..d355ed80929 100644 --- a/docs/libcudacxx/ptx/instructions/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/getctarank.rst @@ -6,13 +6,4 @@ getctarank - PTX ISA: `getctarank `__ -getctarank.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); +.. include:: generated/getctarank.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst deleted file mode 100644 index c383c59c6fd..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst +++ /dev/null @@ -1,232 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-arrive: - -mbarrier.arrive -=============== - -- PTX ISA: - `mbarrier.arrive `__ - -.. _mbarrier.arrive-1: - -mbarrier.arrive ---------------- - -Some of the listed PTX instructions below are semantically equivalent. 
-They differ in one important way: the shorter instructions are typically -supported on older compilers. - -mbarrier.arrive.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); - -mbarrier.arrive.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.no_complete ---------------------------- - -mbarrier.arrive.noComplete.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.expect_tx -------------------------- - -mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); - -Usage ------ - -.. 
code:: cuda - - #include - #include - #include - - __global__ void kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; - - using barrier_t = cuda::barrier; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - __syncthreads(); - - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - // Arrive on local shared memory barrier: - uint64_t token; - token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - - // Sync cluster to ensure remote barrier is initialized. - cluster.sync(); - - // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); - ) - } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst deleted file mode 100644 index 23197e2eb7c..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-test_wait: - -mbarrier.test_wait -================== - -- PTX ISA: - `mbarrier.test_wait `__ - -.. _mbarrier.test_wait-1: - -mbarrier.test_wait ------------------- - -mbarrier.test_wait.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template - __device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.parity -------------------------- - -mbarrier.test_wait.parity.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template - __device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst deleted file mode 100644 index 762f5e100d7..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst +++ /dev/null @@ -1,174 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-try_wait: - -mbarrier.try_wait -================= - -- PTX ISA: - `mbarrier.try_wait `__ - - -.. _mbarrier.try_wait-1: - -mbarrier.try_wait ------------------ - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity ------------------------- - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst new file mode 100644 index 00000000000..f01e7a95465 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst @@ -0,0 +1,68 @@ +.. _libcudacxx-ptx-instructions-mbarrier-arrive: + +mbarrier.arrive +=============== + +- PTX ISA: + `mbarrier.arrive `__ + +.. _mbarrier.arrive-1: + +mbarrier.arrive +--------------- + +Some of the listed PTX instructions below are semantically equivalent. +They differ in one important way: the shorter instructions are typically +supported on older compilers. + +.. include:: generated/mbarrier_arrive.rst + +mbarrier.arrive.no_complete +--------------------------- + +.. include:: generated/mbarrier_arrive_no_complete.rst + +mbarrier.arrive.expect_tx +------------------------- + +.. include:: generated/mbarrier_arrive_expect_tx.rst + +Usage +----- + +.. 
code:: cuda
+
+   #include 
+   #include 
+   #include 
+
+   __global__ void kernel() {
+     using cuda::ptx::sem_release;
+     using cuda::ptx::space_cluster;
+     using cuda::ptx::space_shared;
+     using cuda::ptx::scope_cluster;
+     using cuda::ptx::scope_cta;
+
+     using barrier_t = cuda::barrier;
+     __shared__ barrier_t bar;
+     init(&bar, blockDim.x);
+     __syncthreads();
+
+     NV_IF_TARGET(NV_PROVIDES_SM_90, (
+       // Arrive on local shared memory barrier:
+       uint64_t token;
+       token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1);
+
+       // Get address of remote cluster barrier:
+       namespace cg = cooperative_groups;
+       cg::cluster_group cluster = cg::this_cluster();
+       unsigned int other_block_rank = cluster.block_rank() ^ 1;
+       uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank);
+
+       // Sync cluster to ensure remote barrier is initialized.
+       cluster.sync();
+
+       // Arrive on remote cluster barrier:
+       cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1);
+     )
+   } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst new file mode 100644 index 00000000000..6c34813242f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: + +mbarrier.expect_tx +================== + +- PTX ISA: + `mbarrier.expect_tx `__ + +.. include:: generated/mbarrier_expect_tx.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst new file mode 100644 index 00000000000..a736f53b0a2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-init: + +mbarrier.init +============= + +- PTX ISA: + `mbarrier.init `__ + +.. 
include:: generated/mbarrier_init.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst new file mode 100644 index 00000000000..d8a4e79473e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst @@ -0,0 +1,19 @@ +.. _libcudacxx-ptx-instructions-mbarrier-test_wait: + +mbarrier.test_wait +================== + +- PTX ISA: + `mbarrier.test_wait `__ + +.. _mbarrier.test_wait-1: + +mbarrier.test_wait +------------------ + +.. include:: generated/mbarrier_test_wait.rst + +mbarrier.test_wait.parity +------------------------- + +.. include:: generated/mbarrier_test_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst new file mode 100644 index 00000000000..1869695f3f6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst @@ -0,0 +1,20 @@ +.. _libcudacxx-ptx-instructions-mbarrier-try_wait: + +mbarrier.try_wait +================= + +- PTX ISA: + `mbarrier.try_wait `__ + + +.. _mbarrier.try_wait-1: + +mbarrier.try_wait +----------------- + +.. include:: generated/mbarrier_try_wait.rst + +mbarrier.try_wait.parity +------------------------ + +.. include:: generated/mbarrier_try_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/red_async.rst b/docs/libcudacxx/ptx/instructions/red_async.rst new file mode 100644 index 00000000000..82ba07c38de --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/red_async.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-ptx-instructions-mbarrier-red-async: + +red.async +========= + +- PTX ISA: + `red.async `__ + +.. _red.async-1: + +red.async +--------- + +.. include:: generated/red_async.rst + +red.async ``.s64`` emulation +---------------------------- + +PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This +exposure is emulated in ``cuda::ptx`` using + +.. 
code:: cuda + + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // .op = { .add } + template + __device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 375ce44622e..1e9597fa726 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,386 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -tid.x -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_x(); - -tid.y -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_y(); - -tid.z -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_z(); - -ntid.x -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_x(); - -ntid.y -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_y(); - -ntid.z -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_z(); - -laneid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_laneid(); - -warpid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_warpid(); - -nwarpid -^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nwarpid(); - -ctaid.x -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_x(); - -ctaid.y -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_y(); - -ctaid.z -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_z(); - -nctaid.x -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_x(); - -nctaid.y -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_y(); - -nctaid.z -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_z(); - -smid -^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_smid(); - -nsmid -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nsmid(); - -gridid -^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template - __device__ static inline uint64_t get_sreg_gridid(); - -is_explicit_cluster -^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template - __device__ static inline bool get_sreg_is_explicit_cluster(); - -clusterid.x -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_x(); - -clusterid.y -^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_y(); - -clusterid.z -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_z(); - -nclusterid.x -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_x(); - -nclusterid.y -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_y(); - -nclusterid.z -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_z(); - -cluster_ctaid.x -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); - -cluster_ctaid.y -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); - -cluster_ctaid.z -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); - -cluster_nctaid.x -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); - -cluster_nctaid.y -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); - -cluster_nctaid.z -^^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); - -cluster_ctarank -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctarank(); - -cluster_nctarank -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctarank(); - -lanemask_eq -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_eq(); - -lanemask_le -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_le(); - -lanemask_lt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_lt(); - -lanemask_ge -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_ge(); - -lanemask_gt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_gt(); - -clock -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template - __device__ static inline uint32_t get_sreg_clock(); - -clock_hi -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template - __device__ static inline uint32_t get_sreg_clock_hi(); - -clock64 -^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template - __device__ static inline uint64_t get_sreg_clock64(); - -globaltimer -^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template - __device__ static inline uint64_t get_sreg_globaltimer(); - -globaltimer_lo -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_lo(); - -globaltimer_hi -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_hi(); - -total_smem_size -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_total_smem_size(); - -aggr_smem_size -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template - __device__ static inline uint32_t get_sreg_aggr_smem_size(); - -dynamic_smem_size -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_dynamic_smem_size(); - -current_graph_exec -^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template - __device__ static inline uint64_t get_sreg_current_graph_exec(); +.. include:: generated/special_registers.rst diff --git a/docs/libcudacxx/ptx/instructions/st_async.rst b/docs/libcudacxx/ptx/instructions/st_async.rst new file mode 100644 index 00000000000..c71aebd7da3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_async.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-ptx-instructions-st-async: + +st.async +======== + +- PTX ISA: + `st.async `__ +- Used in: :ref:`How to use st.async ` + +**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For +instance, the ``addr`` supplied to the ``v2.b32`` variant must be +aligned to ``2 x 4 = 8`` bytes. + +.. 
include:: generated/st_async.rst
diff --git a/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst
new file mode 100644
index 00000000000..2f7622bba2c
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy:
+
+tensormap.cp_fenceproxy
+=======================
+
+- PTX ISA:
+  `tensormap.cp_fenceproxy <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy>`__
+
+.. include:: generated/tensormap_cp_fenceproxy.rst
diff --git a/docs/libcudacxx/ptx/instructions/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst
new file mode 100644
index 00000000000..331dcff313a
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst
@@ -0,0 +1,9 @@
+.. _libcudacxx-ptx-instructions-tensormap-replace:
+
+tensormap.replace
+=================
+
+- PTX ISA:
+  `tensormap.replace <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensormap-instructions-tensormap-replace>`__
+
+.. include:: generated/tensormap_replace.rst